Commit e6395b68 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-v3.10-rc5' of git://oss.sgi.com/xfs/xfs

Pull more xfs updates from Ben Myers:
 "Here are several fixes for filesystems with CRC support turned on:
  fixes for quota, remote attributes, and recovery.  There is also some
  feature work related to CRCs: the implementation of CRCs for the inode
  unlinked lists, disabling noattr2/attr2 options when appropriate, and
  bumping the maximum number of ACLs.

  I would have preferred to defer this last category of items to 3.11.
  This would require setting a feature bit for the on-disk changes, so
  there is some pressure to get these in 3.10.  I believe this
  represents the end of the CRC related queue.

   - Rework of dquot CRCs
   - Fix for remote attribute invalidation of a leaf
   - Fix ordering of transaction replay in recovery
   - Implement CRCs for inode unlinked list
   - Disable noattr2/attr2 mount options when CRCs are enabled
   - Bump the limitation of ACL entries for v5 superblocks"

* tag 'for-linus-v3.10-rc5' of git://oss.sgi.com/xfs/xfs:
  xfs: increase number of ACL entries for V5 superblocks
  xfs: disable noattr2/attr2 mount options for CRC enabled filesystems
  xfs: inode unlinked list needs to recalculate the inode CRC
  xfs: fix log recovery transaction item reordering
  xfs: fix remote attribute invalidation for a leaf
  xfs: rework dquot CRCs
parents 29eb7782 0a8aa193
......@@ -33,6 +33,9 @@ When mounting an XFS filesystem, the following options are accepted.
removing extended attributes) the on-disk superblock feature
bit field will be updated to reflect this format being in use.
CRC enabled filesystems always use the attr2 format, and so
will reject the noattr2 mount option if it is set.
barrier
Enables the use of block layer write barriers for writes into
the journal and unwritten extent conversion. This allows for
......
......@@ -21,6 +21,8 @@
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_vnodeops.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include <linux/slab.h>
#include <linux/xattr.h>
......@@ -34,7 +36,9 @@
*/
STATIC struct posix_acl *
xfs_acl_from_disk(struct xfs_acl *aclp)
xfs_acl_from_disk(
struct xfs_acl *aclp,
int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
......@@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
unsigned int count, i;
count = be32_to_cpu(aclp->acl_cnt);
if (count > XFS_ACL_MAX_ENTRIES)
if (count > max_entries)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
......@@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type)
struct xfs_inode *ip = XFS_I(inode);
struct posix_acl *acl;
struct xfs_acl *xfs_acl;
int len = sizeof(struct xfs_acl);
unsigned char *ea_name;
int error;
int len;
acl = get_cached_acl(inode, type);
if (acl != ACL_NOT_CACHED)
......@@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type)
* If we have a cached ACLs value just return it, not need to
* go out to the disk.
*/
xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
len = XFS_ACL_MAX_SIZE(ip->i_mount);
xfs_acl = kzalloc(len, GFP_KERNEL);
if (!xfs_acl)
return ERR_PTR(-ENOMEM);
......@@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type)
goto out;
}
acl = xfs_acl_from_disk(xfs_acl);
acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
......@@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (acl) {
struct xfs_acl *xfs_acl;
int len;
int len = XFS_ACL_MAX_SIZE(ip->i_mount);
xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
xfs_acl = kzalloc(len, GFP_KERNEL);
if (!xfs_acl)
return -ENOMEM;
xfs_acl_to_disk(xfs_acl, acl);
len = sizeof(struct xfs_acl) -
(sizeof(struct xfs_acl_entry) *
(XFS_ACL_MAX_ENTRIES - acl->a_count));
/* subtract away the unused acl entries */
len -= sizeof(struct xfs_acl_entry) *
(XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
len, ATTR_ROOT);
......@@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
static int
xfs_acl_exists(struct inode *inode, unsigned char *name)
{
int len = sizeof(struct xfs_acl);
int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
ATTR_ROOT|ATTR_KERNOVAL) == 0);
......@@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
goto out_release;
error = -EINVAL;
if (acl->a_count > XFS_ACL_MAX_ENTRIES)
if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
goto out_release;
if (type == ACL_TYPE_ACCESS) {
......
......@@ -22,19 +22,36 @@ struct inode;
struct posix_acl;
struct xfs_inode;
#define XFS_ACL_MAX_ENTRIES 25
#define XFS_ACL_NOT_PRESENT (-1)
/* On-disk XFS access control list structure */
struct xfs_acl_entry {
__be32 ae_tag;
__be32 ae_id;
__be16 ae_perm;
__be16 ae_pad; /* fill the implicit hole in the structure */
};
struct xfs_acl {
__be32 acl_cnt;
struct xfs_acl_entry {
__be32 ae_tag;
__be32 ae_id;
__be16 ae_perm;
} acl_entry[XFS_ACL_MAX_ENTRIES];
__be32 acl_cnt;
struct xfs_acl_entry acl_entry[0];
};
/*
* The number of ACL entries allowed is defined by the on-disk format.
* For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
* limited only by the maximum size of the xattr that stores the information.
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
(xfs_sb_version_hascrc(&mp->m_sb) \
? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
#define XFS_ACL_MAX_SIZE(mp) \
(sizeof(struct xfs_acl) + \
sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
......
......@@ -3258,7 +3258,7 @@ xfs_attr3_leaf_inactive(
name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
if (name_rmt->valueblk) {
lp->valueblk = be32_to_cpu(name_rmt->valueblk);
lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
be32_to_cpu(name_rmt->valuelen));
lp++;
}
......
......@@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
if (xfs_sb_version_hascrc(&mp->m_sb))
if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
}
xfs_trans_dquot_buf(tp, bp,
......@@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
}
STATIC void
xfs_dquot_buf_calc_crc(
struct xfs_mount *mp,
struct xfs_buf *bp)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
int i;
if (!xfs_sb_version_hascrc(&mp->m_sb))
return;
for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) {
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
offsetof(struct xfs_dqblk, dd_crc));
}
}
STATIC bool
xfs_dquot_buf_verify_crc(
struct xfs_mount *mp,
......@@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc(
for (i = 0; i < ndquots; i++, d++) {
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
offsetof(struct xfs_dqblk, dd_crc)))
XFS_DQUOT_CRC_OFF))
return false;
if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
return false;
}
return true;
}
......@@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify(
}
}
/*
* we don't calculate the CRC here as that is done when the dquot is flushed to
* the buffer after the update is done. This ensures that the dquot in the
* buffer always has an up-to-date CRC value.
*/
void
xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
......@@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify(
xfs_buf_ioerror(bp, EFSCORRUPTED);
return;
}
xfs_dquot_buf_calc_crc(mp, bp);
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
......@@ -1151,11 +1140,17 @@ xfs_qm_dqflush(
* copy the lsn into the on-disk dquot now while we have the in memory
* dquot here. This can't be done later in the write verifier as we
* can't get access to the log item at that point in time.
*
* We also calculate the CRC here so that the on-disk dquot in the
* buffer always has a valid CRC. This ensures there is no possibility
* of a dquot without an up-to-date CRC getting to disk.
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
/*
......
......@@ -1638,6 +1638,10 @@ xfs_iunlink(
dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
/* need to recalc the inode CRC if appropriate */
xfs_dinode_calc_crc(mp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
......@@ -1723,6 +1727,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
/* need to recalc the inode CRC if appropriate */
xfs_dinode_calc_crc(mp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
......@@ -1796,6 +1804,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
/* need to recalc the inode CRC if appropriate */
xfs_dinode_calc_crc(mp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
......@@ -1809,6 +1821,10 @@ xfs_iunlink_remove(
last_dip->di_next_unlinked = cpu_to_be32(next_agino);
ASSERT(next_agino != 0);
offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
/* need to recalc the inode CRC if appropriate */
xfs_dinode_calc_crc(mp, last_dip);
xfs_trans_inode_buf(tp, last_ibp);
xfs_trans_log_buf(tp, last_ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
......
......@@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans(
}
/*
* Sort the log items in the transaction. Cancelled buffers need
* to be put first so they are processed before any items that might
* modify the buffers. If they are cancelled, then the modifications
* don't need to be replayed.
* Sort the log items in the transaction.
*
* The ordering constraints are defined by the inode allocation and unlink
* behaviour. The rules are:
*
* 1. Every item is only logged once in a given transaction. Hence it
* represents the last logged state of the item. Hence ordering is
* dependent on the order in which operations need to be performed so
* required initial conditions are always met.
*
* 2. Cancelled buffers are recorded in pass 1 in a separate table and
* there's nothing to replay from them so we can simply cull them
* from the transaction. However, we can't do that until after we've
* replayed all the other items because they may be dependent on the
* cancelled buffer and replaying the cancelled buffer can remove it
* form the cancelled buffer table. Hence they have tobe done last.
*
* 3. Inode allocation buffers must be replayed before inode items that
* read the buffer and replay changes into it.
*
* 4. Inode unlink buffers must be replayed after inode items are replayed.
* This ensures that inodes are completely flushed to the inode buffer
* in a "free" state before we remove the unlinked inode list pointer.
*
* Hence the ordering needs to be inode allocation buffers first, inode items
* second, inode unlink buffers third and cancelled buffers last.
*
* But there's a problem with that - we can't tell an inode allocation buffer
* apart from a regular buffer, so we can't separate them. We can, however,
* tell an inode unlink buffer from the others, and so we can separate them out
* from all the other buffers and move them to last.
*
* Hence, 4 lists, in order from head to tail:
* - buffer_list for all buffers except cancelled/inode unlink buffers
* - item_list for all non-buffer items
* - inode_buffer_list for inode unlink buffers
* - cancel_list for the cancelled buffers
*/
STATIC int
xlog_recover_reorder_trans(
......@@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans(
{
xlog_recover_item_t *item, *n;
LIST_HEAD(sort_list);
LIST_HEAD(cancel_list);
LIST_HEAD(buffer_list);
LIST_HEAD(inode_buffer_list);
LIST_HEAD(inode_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
......@@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans(
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
list_move(&item->ri_list, &trans->r_itemq);
list_move(&item->ri_list, &cancel_list);
break;
}
if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
list_move(&item->ri_list, &inode_buffer_list);
break;
}
list_move_tail(&item->ri_list, &buffer_list);
break;
case XFS_LI_INODE:
case XFS_LI_DQUOT:
case XFS_LI_QUOTAOFF:
......@@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
list_move_tail(&item->ri_list, &trans->r_itemq);
list_move_tail(&item->ri_list, &inode_list);
break;
default:
xfs_warn(log->l_mp,
......@@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans(
}
}
ASSERT(list_empty(&sort_list));
if (!list_empty(&buffer_list))
list_splice(&buffer_list, &trans->r_itemq);
if (!list_empty(&inode_list))
list_splice_tail(&inode_list, &trans->r_itemq);
if (!list_empty(&inode_buffer_list))
list_splice_tail(&inode_buffer_list, &trans->r_itemq);
if (!list_empty(&cancel_list))
list_splice_tail(&cancel_list, &trans->r_itemq);
return 0;
}
......@@ -1861,6 +1912,15 @@ xlog_recover_do_inode_buffer(
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
next_unlinked_offset);
*buffer_nextp = *logged_nextp;
/*
* If necessary, recalculate the CRC in the on-disk inode. We
* have to leave the inode in a consistent state for whoever
* reads it next....
*/
xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
}
return 0;
......@@ -2266,6 +2326,12 @@ xfs_qm_dqcheck(
d->dd_diskdq.d_flags = type;
d->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
return errs;
}
......@@ -2793,6 +2859,10 @@ xlog_recover_dquot_pass2(
}
memcpy(ddq, recddq, item->ri_buf[1].i_len);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
......
......@@ -41,6 +41,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_cksum.h"
/*
* The global quota manager. There is only one of these for the entire
......@@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts(
xfs_dqid_t id,
uint type)
{
xfs_disk_dquot_t *ddq;
struct xfs_dqblk *dqb;
int j;
trace_xfs_reset_dqcounts(bp, _RET_IP_);
......@@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts(
do_div(j, sizeof(xfs_dqblk_t));
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
ddq = bp->b_addr;
dqb = bp->b_addr;
for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
struct xfs_disk_dquot *ddq;
ddq = (struct xfs_disk_dquot *)&dqb[j];
/*
* Do a sanity check, and if needed, repair the dqblk. Don't
* output any warnings because it's perfectly possible to
......@@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
xfs_update_cksum((char *)&dqb[j],
sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
}
}
......@@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs(
XFS_FSB_TO_DADDR(mp, bno),
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
if (error)
break;
/*
* XXX(hch): need to figure out if it makes sense to validate
* the CRC here.
* CRC and validation errors will return a EFSCORRUPTED here. If
* this occurs, re-read without CRC validation so that we can
* repair the damage via xfs_qm_reset_dqcounts(). This process
* will leave a trace in the log indicating corruption has
* been detected.
*/
if (error == EFSCORRUPTED) {
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, bno),
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
NULL);
}
if (error)
break;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
/*
* goto the next block.
*/
/* goto the next block. */
bno++;
firstid += mp->m_quotainfo->qi_dqperchunk;
}
......
......@@ -87,6 +87,8 @@ typedef struct xfs_dqblk {
uuid_t dd_uuid; /* location information */
} xfs_dqblk_t;
#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
/*
* flags for q_flags field in the dquot.
*/
......
......@@ -1372,6 +1372,17 @@ xfs_finish_flags(
}
}
/*
* V5 filesystems always use attr2 format for attributes.
*/
if (xfs_sb_version_hascrc(&mp->m_sb) &&
(mp->m_flags & XFS_MOUNT_NOATTR2)) {
xfs_warn(mp,
"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
MNTOPT_NOATTR2, MNTOPT_ATTR2);
return XFS_ERROR(EINVAL);
}
/*
* mkfs'ed attr2 will turn on attr2 mount unless explicitly
* told by noattr2 to turn it off
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment