Commit 6abc05cc authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] xattr: fine-grained locking

From: Andreas Gruenbacher <agruen@suse.de>

This patch removes the dependency on i_sem in the getxattr and
listxattr iops of ext2 and ext3. In addition, the global ext[23]_xattr
semaphores go away. Instead of i_sem and the global semaphore, mutual
exclusion is now ensured by per-inode xattr semaphores, and by locking
the buffers before modifying them. The detailed locking strategy is
described in comments in fs/ext[23]/xattr.c.

Due to this change it is no longer necessary to take i_sem in
ext[23]_permission() for retrieving acls, so the
ext[23]_permission_locked() functions go away.

Additionally, the patch fixes a race condition in ext[23]_permission:
Accessing inode->i_acl was protected by the BKL in 2.4; in 2.5 there no
longer is such protection. Instead, inode->i_acl (and inode->i_default_acl)
are now accessed under inode->i_lock. (This could be replaced by RCU in
the future.)

In the ext3 extended attribute code, an new uglines results from locking
at the buffer head level: The buffer lock must be held between testing
if an xattr block can be modified and the actual modification to prevent
races from happening. Before a block can be modified,
ext3_journal_get_write_access() must be called. But this requies an unlocked
buffer, so I call ext3_journal_get_write_access() before locking the
buffer. If it turns out that the buffer cannot be modified,
journal_release_buffer() is called. Calling ext3_journal_get_write_access
after the test but while the buffer is still locked would be much better.
parent 430cab6d
......@@ -124,14 +124,38 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-EINVAL);
}
static inline struct posix_acl *
ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
{
struct posix_acl *acl = EXT2_ACL_NOT_CACHED;
spin_lock(&inode->i_lock);
if (*i_acl != EXT2_ACL_NOT_CACHED)
acl = posix_acl_dup(*i_acl);
spin_unlock(&inode->i_lock);
return acl;
}
static inline void
ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
struct posix_acl *acl)
{
spin_lock(&inode->i_lock);
if (*i_acl != EXT2_ACL_NOT_CACHED)
posix_acl_release(*i_acl);
*i_acl = posix_acl_dup(acl);
spin_unlock(&inode->i_lock);
}
/*
* inode->i_sem: down
* inode->i_sem: don't care
*/
static struct posix_acl *
ext2_get_acl(struct inode *inode, int type)
{
const size_t max_size = ext2_acl_size(EXT2_ACL_MAX_ENTRIES);
struct ext2_inode_inode *ei = EXT2_I(inode);
struct ext2_inode_info *ei = EXT2_I(inode);
int name_index;
char *value;
struct posix_acl *acl;
......@@ -142,14 +166,16 @@ ext2_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT2_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_acl);
acl = ext2_iget_acl(inode, &ei->i_acl);
if (acl != EXT2_ACL_NOT_CACHED)
return acl;
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT2_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_default_acl);
acl = ext2_iget_acl(inode, &ei->i_default_acl);
if (acl != EXT2_ACL_NOT_CACHED)
return acl;
name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
break;
......@@ -171,11 +197,11 @@ ext2_get_acl(struct inode *inode, int type)
if (!IS_ERR(acl)) {
switch(type) {
case ACL_TYPE_ACCESS:
ei->i_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
ei->i_default_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
......@@ -240,23 +266,24 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (!error) {
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT2_ACL_NOT_CACHED)
posix_acl_release(ei->i_acl);
ei->i_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT2_ACL_NOT_CACHED)
posix_acl_release(ei->i_default_acl);
ei->i_default_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
return error;
}
static int
__ext2_permission(struct inode *inode, int mask, int lock)
/*
* Inode operation permission().
*
* inode->i_sem: don't care
*/
int
ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
{
int mode = inode->i_mode;
......@@ -270,30 +297,16 @@ __ext2_permission(struct inode *inode, int mask, int lock)
if (current->fsuid == inode->i_uid) {
mode >>= 6;
} else if (test_opt(inode->i_sb, POSIX_ACL)) {
struct ext2_inode_info *ei = EXT2_I(inode);
struct posix_acl *acl;
/* The access ACL cannot grant access if the group class
permission bits don't contain all requested permissions. */
if (((mode >> 3) & mask & S_IRWXO) != mask)
goto check_groups;
if (ei->i_acl == EXT2_ACL_NOT_CACHED) {
struct posix_acl *acl;
if (lock) {
down(&inode->i_sem);
acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
up(&inode->i_sem);
} else
acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
if (ei->i_acl == EXT2_ACL_NOT_CACHED)
return -EIO;
}
if (ei->i_acl) {
int error = posix_acl_permission(inode, ei->i_acl,mask);
if (error == -EACCES)
goto check_capabilities;
return error;
......@@ -319,33 +332,11 @@ __ext2_permission(struct inode *inode, int mask, int lock)
return -EACCES;
}
/*
* Inode operation permission().
*
* inode->i_sem: up
* BKL held [before 2.5.x]
*/
int
ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
{
return __ext2_permission(inode, mask, 1);
}
/*
* Used internally if i_sem is already down.
*/
int
ext2_permission_locked(struct inode *inode, int mask)
{
return __ext2_permission(inode, mask, 0);
}
/*
* Initialize the ACLs of a new inode. Called from ext2_new_inode.
*
* dir->i_sem: down
* inode->i_sem: up (access to inode is still exclusive)
* BKL held [before 2.5.x]
*/
int
ext2_init_acl(struct inode *inode, struct inode *dir)
......@@ -405,7 +396,6 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
* file mode.
*
* inode->i_sem: down
* BKL held [before 2.5.x]
*/
int
ext2_acl_chmod(struct inode *inode)
......
......@@ -60,7 +60,6 @@ static inline int ext2_acl_count(size_t size)
/* acl.c */
extern int ext2_permission (struct inode *, int, struct nameidata *);
extern int ext2_permission_locked (struct inode *, int);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
......
......@@ -41,6 +41,16 @@ struct ext2_inode_info {
__u32 i_prealloc_block;
__u32 i_prealloc_count;
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT2_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_sem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
#ifdef CONFIG_EXT2_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
......
......@@ -177,6 +177,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
rwlock_init(&ei->i_meta_lock);
#ifdef CONFIG_EXT2_FS_XATTR
init_rwsem(&ei->xattr_sem);
#endif
inode_init_once(&ei->vfs_inode);
}
}
......
......@@ -42,13 +42,12 @@
*
* Locking strategy
* ----------------
* The VFS already holds the BKL and the inode->i_sem semaphore when any of
* the xattr inode operations are called, so we are guaranteed that only one
* processes accesses extended attributes of an inode at any time.
*
* For writing we also grab the ext2_xattr_sem semaphore. This ensures that
* only a single process is modifying an extended attribute block, even
* if the block is shared among inodes.
* EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem.
* EA blocks are only changed if they are exclusive to an inode, so
* holding xattr_sem also means that nothing but the EA block's reference
* count will change. Multiple writers to an EA block are synchronized
* by the bh lock. No more than a single bh lock is held at any time
* to avoid deadlocks.
*/
#include <linux/buffer_head.h>
......@@ -57,7 +56,7 @@
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
#include <asm/semaphore.h>
#include <linux/rwsem.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
......@@ -105,15 +104,6 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
static struct mb_cache *ext2_xattr_cache;
/*
* If a file system does not share extended attributes among inodes,
* we should not need the ext2_xattr_sem semaphore. However, the
* filesystem may still contain shared blocks, so we always take
* the lock.
*/
static DECLARE_MUTEX(ext2_xattr_sem);
static struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
static rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
......@@ -196,7 +186,7 @@ ext2_xattr_handler(int name_index)
/*
* Inode operation getxattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: don't care
*/
ssize_t
ext2_getxattr(struct dentry *dentry, const char *name,
......@@ -204,39 +194,28 @@ ext2_getxattr(struct dentry *dentry, const char *name,
{
struct ext2_xattr_handler *handler;
struct inode *inode = dentry->d_inode;
ssize_t error;
handler = ext2_xattr_resolve_name(&name);
if (!handler)
return -EOPNOTSUPP;
down(&inode->i_sem);
error = handler->get(inode, name, buffer, size);
up(&inode->i_sem);
return error;
return handler->get(inode, name, buffer, size);
}
/*
* Inode operation listxattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: don't care
*/
ssize_t
ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
ssize_t error;
down(&dentry->d_inode->i_sem);
error = ext2_xattr_list(dentry->d_inode, buffer, size);
up(&dentry->d_inode->i_sem);
return error;
return ext2_xattr_list(dentry->d_inode, buffer, size);
}
/*
* Inode operation setxattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: down
*/
int
ext2_setxattr(struct dentry *dentry, const char *name,
......@@ -256,7 +235,7 @@ ext2_setxattr(struct dentry *dentry, const char *name,
/*
* Inode operation removexattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: down
*/
int
ext2_removexattr(struct dentry *dentry, const char *name)
......@@ -295,12 +274,15 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
if (name == NULL)
return -EINVAL;
down_read(&EXT2_I(inode)->xattr_sem);
error = -ENODATA;
if (!EXT2_I(inode)->i_file_acl)
return -ENODATA;
goto cleanup;
ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
return -EIO;
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
end = bh->b_data + bh->b_size;
......@@ -365,6 +347,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
cleanup:
brelse(bh);
up_read(&EXT2_I(inode)->xattr_sem);
return error;
}
......@@ -391,12 +374,15 @@ ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
down_read(&EXT2_I(inode)->xattr_sem);
error = 0;
if (!EXT2_I(inode)->i_file_acl)
return 0;
goto cleanup;
ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
return -EIO;
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
end = bh->b_data + bh->b_size;
......@@ -449,6 +435,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
cleanup:
brelse(bh);
up_read(&EXT2_I(inode)->xattr_sem);
return error;
}
......@@ -520,8 +507,7 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name,
name_len = strlen(name);
if (name_len > 255 || value_len > sb->s_blocksize)
return -ERANGE;
down(&ext2_xattr_sem);
down_write(&EXT2_I(inode)->xattr_sem);
if (EXT2_I(inode)->i_file_acl) {
/* The inode already has an extended attribute block. */
bh = sb_bread(sb, EXT2_I(inode)->i_file_acl);
......@@ -614,12 +600,16 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
/* Here we know that we can set the new attribute. */
if (header) {
/* assert(header == HDR(bh)); */
lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) {
ea_bdebug(bh, "modifying in-place");
ext2_xattr_cache_remove(bh);
/* keep the buffer locked while modifying it. */
} else {
int offset;
unlock_buffer(bh);
ea_bdebug(bh, "cloning");
header = kmalloc(bh->b_size, GFP_KERNEL);
error = -ENOMEM;
......@@ -644,6 +634,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
last = here = ENTRY(header+1);
}
/* Iff we are modifying the block in-place, bh is locked here. */
if (not_found) {
/* Insert the new name. */
size_t size = EXT2_XATTR_LEN(name_len);
......@@ -714,9 +706,13 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
skip_replace:
if (IS_LAST_ENTRY(ENTRY(header+1))) {
/* This block is now empty. */
if (bh && header == HDR(bh))
unlock_buffer(bh); /* we were modifying in-place. */
error = ext2_xattr_set2(inode, bh, NULL);
} else {
ext2_xattr_rehash(header, here);
if (bh && header == HDR(bh))
unlock_buffer(bh); /* we were modifying in-place. */
error = ext2_xattr_set2(inode, bh, header);
}
......@@ -724,7 +720,7 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
brelse(bh);
if (!(bh && header == HDR(bh)))
kfree(header);
up(&ext2_xattr_sem);
up_write(&EXT2_I(inode)->xattr_sem);
return error;
}
......@@ -744,24 +740,28 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
new_bh = ext2_xattr_cache_find(inode, header);
if (new_bh) {
/*
* We found an identical block in the cache.
* The old block will be released after updating
* the inode.
* We found an identical block in the cache. The
* block returned is locked. The old block will
* be released after updating the inode.
*/
ea_bdebug(new_bh, "%s block %lu",
(old_bh == new_bh) ? "keeping" : "reusing",
(unsigned long) new_bh->b_blocknr);
error = -EDQUOT;
if (DQUOT_ALLOC_BLOCK(inode, 1))
if (DQUOT_ALLOC_BLOCK(inode, 1)) {
unlock_buffer(new_bh);
goto cleanup;
}
HDR(new_bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
ea_bdebug(new_bh, "refcount now=%d",
le32_to_cpu(HDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
} else if (old_bh && header == HDR(old_bh)) {
/* Keep this block. */
/* Keep this block. No need to lock the block as we
don't need to change the reference count. */
new_bh = old_bh;
get_bh(new_bh);
ext2_xattr_cache_insert(new_bh);
......@@ -812,12 +812,11 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
error = 0;
if (old_bh && old_bh != new_bh) {
/*
* If there was an old block, and we are not still using it,
* we now release the old block.
* If there was an old block and we are no longer using it,
* release the old block.
*/
unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
if (refcount == 1) {
lock_buffer(old_bh);
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
/* Free the old block. */
ea_bdebug(old_bh, "freeing");
ext2_free_blocks(inode, old_bh->b_blocknr, 1);
......@@ -827,12 +826,14 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
bforget(old_bh);
} else {
/* Decrement the refcount only. */
refcount--;
HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
HDR(old_bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(old_bh)->h_refcount) - 1);
DQUOT_FREE_BLOCK(inode, 1);
mark_buffer_dirty(old_bh);
ea_bdebug(old_bh, "refcount now=%d", refcount);
ea_bdebug(old_bh, "refcount now=%d",
le32_to_cpu(HDR(old_bh)->h_refcount));
}
unlock_buffer(old_bh);
}
cleanup:
......@@ -850,12 +851,11 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
void
ext2_xattr_delete_inode(struct inode *inode)
{
struct buffer_head *bh;
struct buffer_head *bh = NULL;
down_write(&EXT2_I(inode)->xattr_sem);
if (!EXT2_I(inode)->i_file_acl)
return;
down(&ext2_xattr_sem);
goto cleanup;
bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
......@@ -871,7 +871,7 @@ ext2_xattr_delete_inode(struct inode *inode)
EXT2_I(inode)->i_file_acl);
goto cleanup;
}
ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
lock_buffer(bh);
if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
ext2_xattr_cache_remove(bh);
ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
......@@ -885,11 +885,13 @@ ext2_xattr_delete_inode(struct inode *inode)
sync_dirty_buffer(bh);
DQUOT_FREE_BLOCK(inode, 1);
}
ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
unlock_buffer(bh);
EXT2_I(inode)->i_file_acl = 0;
cleanup:
brelse(bh);
up(&ext2_xattr_sem);
up_write(&EXT2_I(inode)->xattr_sem);
}
/*
......@@ -982,8 +984,8 @@ ext2_xattr_cmp(struct ext2_xattr_header *header1,
*
* Find an identical extended attribute block.
*
* Returns a pointer to the block found, or NULL if such a block was
* not found or an error occurred.
* Returns a locked buffer head to the block found, or NULL if such
* a block was not found or an error occurred.
*/
static struct buffer_head *
ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
......@@ -1003,18 +1005,23 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
"inode %ld: block %ld read error",
inode->i_ino, (unsigned long) ce->e_block);
} else if (le32_to_cpu(HDR(bh)->h_refcount) >
} else {
lock_buffer(bh);
if (le32_to_cpu(HDR(bh)->h_refcount) >
EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
(unsigned long) ce->e_block,
le32_to_cpu(HDR(bh)->h_refcount),
EXT2_XATTR_REFCOUNT_MAX);
} else if (!ext2_xattr_cmp(header, HDR(bh))) {
ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
ea_bdebug(bh, "b_count=%d",
atomic_read(&(bh->b_count)));
mb_cache_entry_release(ce);
return bh;
}
unlock_buffer(bh);
brelse(bh);
}
ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
}
return NULL;
......
......@@ -11,10 +11,6 @@
#include "ext2.h"
#include "xattr.h"
#ifdef CONFIG_EXT2_FS_POSIX_ACL
# include "acl.h"
#endif
#define XATTR_USER_PREFIX "user."
static size_t
......@@ -44,11 +40,7 @@ ext2_xattr_user_get(struct inode *inode, const char *name,
return -EINVAL;
if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
error = ext2_permission_locked(inode, MAY_READ);
#else
error = permission(inode, MAY_READ, NULL);
#endif
if (error)
return error;
......@@ -68,11 +60,7 @@ ext2_xattr_user_set(struct inode *inode, const char *name,
if ( !S_ISREG(inode->i_mode) &&
(!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
return -EPERM;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
error = ext2_permission_locked(inode, MAY_WRITE);
#else
error = permission(inode, MAY_WRITE, NULL);
#endif
if (error)
return error;
......
......@@ -125,10 +125,34 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-EINVAL);
}
static inline struct posix_acl *
ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
{
struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
spin_lock(&inode->i_lock);
if (*i_acl != EXT3_ACL_NOT_CACHED)
acl = posix_acl_dup(*i_acl);
spin_unlock(&inode->i_lock);
return acl;
}
static inline void
ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
struct posix_acl *acl)
{
spin_lock(&inode->i_lock);
if (*i_acl != EXT3_ACL_NOT_CACHED)
posix_acl_release(*i_acl);
*i_acl = posix_acl_dup(acl);
spin_unlock(&inode->i_lock);
}
/*
* Inode operation get_posix_acl().
*
* inode->i_sem: down
* inode->i_sem: don't care
*/
static struct posix_acl *
ext3_get_acl(struct inode *inode, int type)
......@@ -145,14 +169,16 @@ ext3_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT3_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_acl);
acl = ext3_iget_acl(inode, &ei->i_acl);
if (acl != EXT3_ACL_NOT_CACHED)
return acl;
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT3_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_default_acl);
acl = ext3_iget_acl(inode, &ei->i_default_acl);
if (acl != EXT3_ACL_NOT_CACHED)
return acl;
name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
break;
......@@ -174,11 +200,11 @@ ext3_get_acl(struct inode *inode, int type)
if (!IS_ERR(acl)) {
switch(type) {
case ACL_TYPE_ACCESS:
ei->i_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
ei->i_default_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
......@@ -245,23 +271,24 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
if (!error) {
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT3_ACL_NOT_CACHED)
posix_acl_release(ei->i_acl);
ei->i_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT3_ACL_NOT_CACHED)
posix_acl_release(ei->i_default_acl);
ei->i_default_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
return error;
}
static int
__ext3_permission(struct inode *inode, int mask, int lock)
/*
* Inode operation permission().
*
* inode->i_sem: don't care
*/
int
ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
{
int mode = inode->i_mode;
......@@ -275,30 +302,16 @@ __ext3_permission(struct inode *inode, int mask, int lock)
if (current->fsuid == inode->i_uid) {
mode >>= 6;
} else if (test_opt(inode->i_sb, POSIX_ACL)) {
struct ext3_inode_info *ei = EXT3_I(inode);
struct posix_acl *acl;
/* The access ACL cannot grant access if the group class
permission bits don't contain all requested permissions. */
if (((mode >> 3) & mask & S_IRWXO) != mask)
goto check_groups;
if (ei->i_acl == EXT3_ACL_NOT_CACHED) {
struct posix_acl *acl;
if (lock) {
down(&inode->i_sem);
acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
up(&inode->i_sem);
} else
acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
if (ei->i_acl == EXT3_ACL_NOT_CACHED)
return -EIO;
}
if (ei->i_acl) {
int error = posix_acl_permission(inode, ei->i_acl,mask);
if (error == -EACCES)
goto check_capabilities;
return error;
......@@ -324,26 +337,6 @@ __ext3_permission(struct inode *inode, int mask, int lock)
return -EACCES;
}
/*
* Inode operation permission().
*
* inode->i_sem: up
*/
int
ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
{
return __ext3_permission(inode, mask, 1);
}
/*
* Used internally if i_sem is already down.
*/
int
ext3_permission_locked(struct inode *inode, int mask)
{
return __ext3_permission(inode, mask, 0);
}
/*
* Initialize the ACLs of a new inode. Called from ext3_new_inode.
*
......
......@@ -60,7 +60,6 @@ static inline int ext3_acl_count(size_t size)
/* acl.c */
extern int ext3_permission (struct inode *, int, struct nameidata *);
extern int ext3_permission_locked (struct inode *, int);
extern int ext3_acl_chmod (struct inode *);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
......
......@@ -519,6 +519,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
INIT_LIST_HEAD(&ei->i_orphan);
#ifdef CONFIG_EXT3_FS_XATTR
init_rwsem(&ei->xattr_sem);
#endif
init_rwsem(&ei->truncate_sem);
inode_init_once(&ei->vfs_inode);
}
......
......@@ -43,13 +43,12 @@
*
* Locking strategy
* ----------------
* The VFS holdsinode->i_sem semaphore when any of the xattr inode
* operations are called, so we are guaranteed that only one
* processes accesses extended attributes of an inode at any time.
*
* For writing we also grab the ext3_xattr_sem semaphore. This ensures that
* only a single process is modifying an extended attribute block, even
* if the block is shared among inodes.
* EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
* EA blocks are only changed if they are exclusive to an inode, so
* holding xattr_sem also means that nothing but the EA block's reference
* count will change. Multiple writers to an EA block are synchronized
* by the bh lock. No more than a single bh lock is held at any time
* to avoid deadlocks.
*/
#include <linux/init.h>
......@@ -59,7 +58,7 @@
#include <linux/ext3_fs.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
#include <asm/semaphore.h>
#include <linux/rwsem.h>
#include "xattr.h"
#include "acl.h"
......@@ -93,22 +92,14 @@ static int ext3_xattr_set_handle2(handle_t *, struct inode *,
struct ext3_xattr_header *);
static int ext3_xattr_cache_insert(struct buffer_head *);
static struct buffer_head *ext3_xattr_cache_find(struct inode *,
struct ext3_xattr_header *);
static struct buffer_head *ext3_xattr_cache_find(handle_t *, struct inode *,
struct ext3_xattr_header *,
int *);
static void ext3_xattr_cache_remove(struct buffer_head *);
static void ext3_xattr_rehash(struct ext3_xattr_header *,
struct ext3_xattr_entry *);
static struct mb_cache *ext3_xattr_cache;
/*
* If a file system does not share extended attributes among inodes,
* we should not need the ext3_xattr_sem semaphore. However, the
* filesystem may still contain shared blocks, so we always take
* the lock.
*/
static DECLARE_MUTEX(ext3_xattr_sem);
static struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
static rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
......@@ -191,7 +182,7 @@ ext3_xattr_handler(int name_index)
/*
* Inode operation getxattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: don't care
*/
ssize_t
ext3_getxattr(struct dentry *dentry, const char *name,
......@@ -199,39 +190,28 @@ ext3_getxattr(struct dentry *dentry, const char *name,
{
struct ext3_xattr_handler *handler;
struct inode *inode = dentry->d_inode;
ssize_t error;
handler = ext3_xattr_resolve_name(&name);
if (!handler)
return -EOPNOTSUPP;
down(&inode->i_sem);
error = handler->get(inode, name, buffer, size);
up(&inode->i_sem);
return error;
return handler->get(inode, name, buffer, size);
}
/*
* Inode operation listxattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: don't care
*/
ssize_t
ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
ssize_t error;
down(&dentry->d_inode->i_sem);
error = ext3_xattr_list(dentry->d_inode, buffer, size);
up(&dentry->d_inode->i_sem);
return error;
return ext3_xattr_list(dentry->d_inode, buffer, size);
}
/*
* Inode operation setxattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: down
*/
int
ext3_setxattr(struct dentry *dentry, const char *name,
......@@ -251,7 +231,7 @@ ext3_setxattr(struct dentry *dentry, const char *name,
/*
* Inode operation removexattr()
*
* dentry->d_inode->i_sem down
* dentry->d_inode->i_sem: down
*/
int
ext3_removexattr(struct dentry *dentry, const char *name)
......@@ -290,12 +270,15 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name,
if (name == NULL)
return -EINVAL;
down_read(&EXT3_I(inode)->xattr_sem);
error = -ENODATA;
if (!EXT3_I(inode)->i_file_acl)
return -ENODATA;
goto cleanup;
ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
return -EIO;
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
end = bh->b_data + bh->b_size;
......@@ -360,6 +343,7 @@ bad_block: ext3_error(inode->i_sb, "ext3_xattr_get",
cleanup:
brelse(bh);
up_read(&EXT3_I(inode)->xattr_sem);
return error;
}
......@@ -386,12 +370,15 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
down_read(&EXT3_I(inode)->xattr_sem);
error = 0;
if (!EXT3_I(inode)->i_file_acl)
return 0;
goto cleanup;
ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
return -EIO;
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
end = bh->b_data + bh->b_size;
......@@ -444,6 +431,7 @@ bad_block: ext3_error(inode->i_sb, "ext3_xattr_list",
cleanup:
brelse(bh);
up_read(&EXT3_I(inode)->xattr_sem);
return error;
}
......@@ -459,11 +447,12 @@ static void ext3_xattr_update_super_block(handle_t *handle,
return;
lock_super(sb);
ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
EXT3_SB(sb)->s_es->s_feature_compat |=
cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
sb->s_dirt = 1;
ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
}
unlock_super(sb);
}
......@@ -518,8 +507,7 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
name_len = strlen(name);
if (name_len > 255 || value_len > sb->s_blocksize)
return -ERANGE;
down(&ext3_xattr_sem);
down_write(&EXT3_I(inode)->xattr_sem);
if (EXT3_I(inode)->i_file_acl) {
/* The inode already has an extended attribute block. */
bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
......@@ -612,15 +600,28 @@ bad_block: ext3_error(sb, "ext3_xattr_set",
/* Here we know that we can set the new attribute. */
if (header) {
int credits = 0;
/* assert(header == HDR(bh)); */
if (header->h_refcount != cpu_to_le32(1))
goto skip_get_write_access;
/* ext3_journal_get_write_access() requires an unlocked bh,
which complicates things here. */
error = ext3_journal_get_write_access_credits(handle, bh,
&credits);
if (error)
goto cleanup;
lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) {
ea_bdebug(bh, "modifying in-place");
ext3_xattr_cache_remove(bh);
error = ext3_journal_get_write_access(handle, bh);
if (error)
goto cleanup;
/* keep the buffer locked while modifying it. */
} else {
int offset;
unlock_buffer(bh);
journal_release_buffer(handle, bh, credits);
skip_get_write_access:
ea_bdebug(bh, "cloning");
header = kmalloc(bh->b_size, GFP_KERNEL);
error = -ENOMEM;
......@@ -645,6 +646,8 @@ bad_block: ext3_error(sb, "ext3_xattr_set",
last = here = ENTRY(header+1);
}
/* Iff we are modifying the block in-place, bh is locked here. */
if (not_found) {
/* Insert the new name. */
size_t size = EXT3_XATTR_LEN(name_len);
......@@ -715,9 +718,13 @@ bad_block: ext3_error(sb, "ext3_xattr_set",
skip_replace:
if (IS_LAST_ENTRY(ENTRY(header+1))) {
/* This block is now empty. */
if (bh && header == HDR(bh))
unlock_buffer(bh); /* we were modifying in-place. */
error = ext3_xattr_set_handle2(handle, inode, bh, NULL);
} else {
ext3_xattr_rehash(header, here);
if (bh && header == HDR(bh))
unlock_buffer(bh); /* we were modifying in-place. */
error = ext3_xattr_set_handle2(handle, inode, bh, header);
}
......@@ -725,7 +732,7 @@ bad_block: ext3_error(sb, "ext3_xattr_set",
brelse(bh);
if (!(bh && header == HDR(bh)))
kfree(header);
up(&ext3_xattr_sem);
up_write(&EXT3_I(inode)->xattr_sem);
return error;
}
......@@ -740,33 +747,34 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode,
{
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh = NULL;
int error;
int credits = 0, error;
if (header) {
new_bh = ext3_xattr_cache_find(inode, header);
new_bh = ext3_xattr_cache_find(handle, inode, header, &credits);
if (new_bh) {
/*
* We found an identical block in the cache.
* The old block will be released after updating
* the inode.
* We found an identical block in the cache. The
* block returned is locked. The old block will
* be released after updating the inode.
*/
ea_bdebug(new_bh, "%s block %lu",
(old_bh == new_bh) ? "keeping" : "reusing",
(unsigned long) new_bh->b_blocknr);
error = -EDQUOT;
if (DQUOT_ALLOC_BLOCK(inode, 1))
goto cleanup;
error = ext3_journal_get_write_access(handle, new_bh);
if (error)
if (DQUOT_ALLOC_BLOCK(inode, 1)) {
unlock_buffer(new_bh);
journal_release_buffer(handle, new_bh, credits);
goto cleanup;
}
HDR(new_bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
ea_bdebug(new_bh, "refcount now=%d",
le32_to_cpu(HDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
} else if (old_bh && header == HDR(old_bh)) {
/* Keep this block. */
/* Keep this block. No need to lock the block as we
* don't need to change the reference count. */
new_bh = old_bh;
get_bh(new_bh);
ext3_xattr_cache_insert(new_bh);
......@@ -817,15 +825,14 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode,
error = 0;
if (old_bh && old_bh != new_bh) {
/*
* If there was an old block, and we are not still using it,
* we now release the old block.
* If there was an old block, and we are no longer using it,
* release the old block.
*/
unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
error = ext3_journal_get_write_access(handle, old_bh);
if (error)
goto cleanup;
if (refcount == 1) {
lock_buffer(old_bh);
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
/* Free the old block. */
ea_bdebug(old_bh, "freeing");
ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
......@@ -837,12 +844,14 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode,
ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
} else {
/* Decrement the refcount only. */
refcount--;
HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
HDR(old_bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(old_bh)->h_refcount) - 1);
DQUOT_FREE_BLOCK(inode, 1);
ext3_journal_dirty_metadata(handle, old_bh);
ea_bdebug(old_bh, "refcount now=%d", refcount);
ea_bdebug(old_bh, "refcount now=%d",
le32_to_cpu(HDR(old_bh)->h_refcount));
}
unlock_buffer(old_bh);
}
cleanup:
......@@ -886,12 +895,11 @@ ext3_xattr_set(struct inode *inode, int name_index, const char *name,
void
ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
{
struct buffer_head *bh;
struct buffer_head *bh = NULL;
down_write(&EXT3_I(inode)->xattr_sem);
if (!EXT3_I(inode)->i_file_acl)
return;
down(&ext3_xattr_sem);
goto cleanup;
bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
if (!bh) {
ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
......@@ -899,7 +907,6 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
EXT3_I(inode)->i_file_acl);
goto cleanup;
}
ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
HDR(bh)->h_blocks != cpu_to_le32(1)) {
ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
......@@ -907,8 +914,9 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
EXT3_I(inode)->i_file_acl);
goto cleanup;
}
ext3_journal_get_write_access(handle, bh);
ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
if (ext3_journal_get_write_access(handle, bh) != 0)
goto cleanup;
lock_buffer(bh);
if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
ext3_xattr_cache_remove(bh);
ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1);
......@@ -922,11 +930,13 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
handle->h_sync = 1;
DQUOT_FREE_BLOCK(inode, 1);
}
ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
unlock_buffer(bh);
EXT3_I(inode)->i_file_acl = 0;
cleanup:
brelse(bh);
up(&ext3_xattr_sem);
up_write(&EXT3_I(inode)->xattr_sem);
}
/*
......@@ -1022,7 +1032,8 @@ ext3_xattr_cmp(struct ext3_xattr_header *header1,
* not found or an error occurred.
*/
static struct buffer_head *
ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
ext3_xattr_cache_find(handle_t *handle, struct inode *inode,
struct ext3_xattr_header *header, int *credits)
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
......@@ -1039,18 +1050,28 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
ext3_error(inode->i_sb, "ext3_xattr_cache_find",
"inode %ld: block %ld read error",
inode->i_ino, (unsigned long) ce->e_block);
} else if (le32_to_cpu(HDR(bh)->h_refcount) >
} else {
/* ext3_journal_get_write_access() requires an unlocked
* bh, which complicates things here. */
if (ext3_journal_get_write_access_credits(handle, bh,
credits) != 0)
return NULL;
lock_buffer(bh);
if (le32_to_cpu(HDR(bh)->h_refcount) >
EXT3_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
(unsigned long) ce->e_block,
le32_to_cpu(HDR(bh)->h_refcount),
EXT3_XATTR_REFCOUNT_MAX);
} else if (!ext3_xattr_cmp(header, HDR(bh))) {
ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
mb_cache_entry_release(ce);
/* buffer will be unlocked by caller */
return bh;
}
unlock_buffer(bh);
journal_release_buffer(handle, bh, *credits);
brelse(bh);
}
ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
}
return NULL;
......
......@@ -13,10 +13,6 @@
#include <linux/ext3_fs.h>
#include "xattr.h"
#ifdef CONFIG_EXT3_FS_POSIX_ACL
# include "acl.h"
#endif
#define XATTR_USER_PREFIX "user."
static size_t
......@@ -46,11 +42,7 @@ ext3_xattr_user_get(struct inode *inode, const char *name,
return -EINVAL;
if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
error = ext3_permission_locked(inode, MAY_READ);
#else
error = permission(inode, MAY_READ, NULL);
#endif
if (error)
return error;
......@@ -70,11 +62,7 @@ ext3_xattr_user_set(struct inode *inode, const char *name,
if ( !S_ISREG(inode->i_mode) &&
(!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
return -EPERM;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
error = ext3_permission_locked(inode, MAY_WRITE);
#else
error = permission(inode, MAY_WRITE, NULL);
#endif
if (error)
return error;
......
......@@ -62,6 +62,16 @@ struct ext3_inode_info {
__u32 i_prealloc_count;
#endif
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT3_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_sem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
#ifdef CONFIG_EXT3_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment