Commit 6abc05cc authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] xattr: fine-grained locking

From: Andreas Gruenbacher <agruen@suse.de>

This patch removes the dependency on i_sem in the getxattr and
listxattr iops of ext2 and ext3. In addition, the global ext[23]_xattr
semaphores go away. Instead of i_sem and the global semaphore, mutual
exclusion is now ensured by per-inode xattr semaphores, and by locking
the buffers before modifying them. The detailed locking strategy is
described in comments in fs/ext[23]/xattr.c.

Due to this change it is no longer necessary to take i_sem in
ext[23]_permission() for retrieving acls, so the
ext[23]_permission_locked() functions go away.

Additionally, the patch fixes a race condition in ext[23]_permission:
Accessing inode->i_acl was protected by the BKL in 2.4; in 2.5 there no
longer is such protection. Instead, inode->i_acl (and inode->i_default_acl)
are now accessed under inode->i_lock. (This could be replaced by RCU in
the future.)

In the ext3 extended attribute code, an new uglines results from locking
at the buffer head level: The buffer lock must be held between testing
if an xattr block can be modified and the actual modification to prevent
races from happening. Before a block can be modified,
ext3_journal_get_write_access() must be called. But this requies an unlocked
buffer, so I call ext3_journal_get_write_access() before locking the
buffer. If it turns out that the buffer cannot be modified,
journal_release_buffer() is called. Calling ext3_journal_get_write_access
after the test but while the buffer is still locked would be much better.
parent 430cab6d
......@@ -124,14 +124,38 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-EINVAL);
}
static inline struct posix_acl *
ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
{
struct posix_acl *acl = EXT2_ACL_NOT_CACHED;
spin_lock(&inode->i_lock);
if (*i_acl != EXT2_ACL_NOT_CACHED)
acl = posix_acl_dup(*i_acl);
spin_unlock(&inode->i_lock);
return acl;
}
static inline void
ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
struct posix_acl *acl)
{
spin_lock(&inode->i_lock);
if (*i_acl != EXT2_ACL_NOT_CACHED)
posix_acl_release(*i_acl);
*i_acl = posix_acl_dup(acl);
spin_unlock(&inode->i_lock);
}
/*
* inode->i_sem: down
* inode->i_sem: don't care
*/
static struct posix_acl *
ext2_get_acl(struct inode *inode, int type)
{
const size_t max_size = ext2_acl_size(EXT2_ACL_MAX_ENTRIES);
struct ext2_inode_inode *ei = EXT2_I(inode);
struct ext2_inode_info *ei = EXT2_I(inode);
int name_index;
char *value;
struct posix_acl *acl;
......@@ -142,14 +166,16 @@ ext2_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT2_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_acl);
acl = ext2_iget_acl(inode, &ei->i_acl);
if (acl != EXT2_ACL_NOT_CACHED)
return acl;
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT2_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_default_acl);
acl = ext2_iget_acl(inode, &ei->i_default_acl);
if (acl != EXT2_ACL_NOT_CACHED)
return acl;
name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
break;
......@@ -171,11 +197,11 @@ ext2_get_acl(struct inode *inode, int type)
if (!IS_ERR(acl)) {
switch(type) {
case ACL_TYPE_ACCESS:
ei->i_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
ei->i_default_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
......@@ -240,23 +266,24 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (!error) {
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT2_ACL_NOT_CACHED)
posix_acl_release(ei->i_acl);
ei->i_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT2_ACL_NOT_CACHED)
posix_acl_release(ei->i_default_acl);
ei->i_default_acl = posix_acl_dup(acl);
ext2_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
return error;
}
static int
__ext2_permission(struct inode *inode, int mask, int lock)
/*
* Inode operation permission().
*
* inode->i_sem: don't care
*/
int
ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
{
int mode = inode->i_mode;
......@@ -270,30 +297,16 @@ __ext2_permission(struct inode *inode, int mask, int lock)
if (current->fsuid == inode->i_uid) {
mode >>= 6;
} else if (test_opt(inode->i_sb, POSIX_ACL)) {
struct ext2_inode_info *ei = EXT2_I(inode);
struct posix_acl *acl;
/* The access ACL cannot grant access if the group class
permission bits don't contain all requested permissions. */
if (((mode >> 3) & mask & S_IRWXO) != mask)
goto check_groups;
if (ei->i_acl == EXT2_ACL_NOT_CACHED) {
struct posix_acl *acl;
if (lock) {
down(&inode->i_sem);
acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
up(&inode->i_sem);
} else
acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
if (ei->i_acl == EXT2_ACL_NOT_CACHED)
return -EIO;
}
if (ei->i_acl) {
int error = posix_acl_permission(inode, ei->i_acl,mask);
if (error == -EACCES)
goto check_capabilities;
return error;
......@@ -319,33 +332,11 @@ __ext2_permission(struct inode *inode, int mask, int lock)
return -EACCES;
}
/*
* Inode operation permission().
*
* inode->i_sem: up
* BKL held [before 2.5.x]
*/
int
ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
{
return __ext2_permission(inode, mask, 1);
}
/*
* Used internally if i_sem is already down.
*/
int
ext2_permission_locked(struct inode *inode, int mask)
{
return __ext2_permission(inode, mask, 0);
}
/*
* Initialize the ACLs of a new inode. Called from ext2_new_inode.
*
* dir->i_sem: down
* inode->i_sem: up (access to inode is still exclusive)
* BKL held [before 2.5.x]
*/
int
ext2_init_acl(struct inode *inode, struct inode *dir)
......@@ -405,7 +396,6 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
* file mode.
*
* inode->i_sem: down
* BKL held [before 2.5.x]
*/
int
ext2_acl_chmod(struct inode *inode)
......
......@@ -60,7 +60,6 @@ static inline int ext2_acl_count(size_t size)
/* acl.c */
extern int ext2_permission (struct inode *, int, struct nameidata *);
extern int ext2_permission_locked (struct inode *, int);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
......
......@@ -41,6 +41,16 @@ struct ext2_inode_info {
__u32 i_prealloc_block;
__u32 i_prealloc_count;
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT2_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_sem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
#ifdef CONFIG_EXT2_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
......
......@@ -177,6 +177,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
rwlock_init(&ei->i_meta_lock);
#ifdef CONFIG_EXT2_FS_XATTR
init_rwsem(&ei->xattr_sem);
#endif
inode_init_once(&ei->vfs_inode);
}
}
......
This diff is collapsed.
......@@ -11,10 +11,6 @@
#include "ext2.h"
#include "xattr.h"
#ifdef CONFIG_EXT2_FS_POSIX_ACL
# include "acl.h"
#endif
#define XATTR_USER_PREFIX "user."
static size_t
......@@ -44,11 +40,7 @@ ext2_xattr_user_get(struct inode *inode, const char *name,
return -EINVAL;
if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
error = ext2_permission_locked(inode, MAY_READ);
#else
error = permission(inode, MAY_READ, NULL);
#endif
if (error)
return error;
......@@ -68,11 +60,7 @@ ext2_xattr_user_set(struct inode *inode, const char *name,
if ( !S_ISREG(inode->i_mode) &&
(!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
return -EPERM;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
error = ext2_permission_locked(inode, MAY_WRITE);
#else
error = permission(inode, MAY_WRITE, NULL);
#endif
if (error)
return error;
......
......@@ -125,10 +125,34 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-EINVAL);
}
static inline struct posix_acl *
ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
{
struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
spin_lock(&inode->i_lock);
if (*i_acl != EXT3_ACL_NOT_CACHED)
acl = posix_acl_dup(*i_acl);
spin_unlock(&inode->i_lock);
return acl;
}
static inline void
ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
struct posix_acl *acl)
{
spin_lock(&inode->i_lock);
if (*i_acl != EXT3_ACL_NOT_CACHED)
posix_acl_release(*i_acl);
*i_acl = posix_acl_dup(acl);
spin_unlock(&inode->i_lock);
}
/*
* Inode operation get_posix_acl().
*
* inode->i_sem: down
* inode->i_sem: don't care
*/
static struct posix_acl *
ext3_get_acl(struct inode *inode, int type)
......@@ -145,14 +169,16 @@ ext3_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT3_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_acl);
acl = ext3_iget_acl(inode, &ei->i_acl);
if (acl != EXT3_ACL_NOT_CACHED)
return acl;
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT3_ACL_NOT_CACHED)
return posix_acl_dup(ei->i_default_acl);
acl = ext3_iget_acl(inode, &ei->i_default_acl);
if (acl != EXT3_ACL_NOT_CACHED)
return acl;
name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
break;
......@@ -174,11 +200,11 @@ ext3_get_acl(struct inode *inode, int type)
if (!IS_ERR(acl)) {
switch(type) {
case ACL_TYPE_ACCESS:
ei->i_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
ei->i_default_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
......@@ -245,23 +271,24 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
if (!error) {
switch(type) {
case ACL_TYPE_ACCESS:
if (ei->i_acl != EXT3_ACL_NOT_CACHED)
posix_acl_release(ei->i_acl);
ei->i_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_acl, acl);
break;
case ACL_TYPE_DEFAULT:
if (ei->i_default_acl != EXT3_ACL_NOT_CACHED)
posix_acl_release(ei->i_default_acl);
ei->i_default_acl = posix_acl_dup(acl);
ext3_iset_acl(inode, &ei->i_default_acl, acl);
break;
}
}
return error;
}
static int
__ext3_permission(struct inode *inode, int mask, int lock)
/*
* Inode operation permission().
*
* inode->i_sem: don't care
*/
int
ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
{
int mode = inode->i_mode;
......@@ -275,30 +302,16 @@ __ext3_permission(struct inode *inode, int mask, int lock)
if (current->fsuid == inode->i_uid) {
mode >>= 6;
} else if (test_opt(inode->i_sb, POSIX_ACL)) {
struct ext3_inode_info *ei = EXT3_I(inode);
struct posix_acl *acl;
/* The access ACL cannot grant access if the group class
permission bits don't contain all requested permissions. */
if (((mode >> 3) & mask & S_IRWXO) != mask)
goto check_groups;
if (ei->i_acl == EXT3_ACL_NOT_CACHED) {
struct posix_acl *acl;
if (lock) {
down(&inode->i_sem);
acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
up(&inode->i_sem);
} else
acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
if (ei->i_acl == EXT3_ACL_NOT_CACHED)
return -EIO;
}
if (ei->i_acl) {
int error = posix_acl_permission(inode, ei->i_acl,mask);
if (error == -EACCES)
goto check_capabilities;
return error;
......@@ -324,26 +337,6 @@ __ext3_permission(struct inode *inode, int mask, int lock)
return -EACCES;
}
/*
* Inode operation permission().
*
* inode->i_sem: up
*/
int
ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
{
return __ext3_permission(inode, mask, 1);
}
/*
* Used internally if i_sem is already down.
*/
int
ext3_permission_locked(struct inode *inode, int mask)
{
return __ext3_permission(inode, mask, 0);
}
/*
* Initialize the ACLs of a new inode. Called from ext3_new_inode.
*
......
......@@ -60,7 +60,6 @@ static inline int ext3_acl_count(size_t size)
/* acl.c */
extern int ext3_permission (struct inode *, int, struct nameidata *);
extern int ext3_permission_locked (struct inode *, int);
extern int ext3_acl_chmod (struct inode *);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
......
......@@ -519,6 +519,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
INIT_LIST_HEAD(&ei->i_orphan);
#ifdef CONFIG_EXT3_FS_XATTR
init_rwsem(&ei->xattr_sem);
#endif
init_rwsem(&ei->truncate_sem);
inode_init_once(&ei->vfs_inode);
}
......
This diff is collapsed.
......@@ -13,10 +13,6 @@
#include <linux/ext3_fs.h>
#include "xattr.h"
#ifdef CONFIG_EXT3_FS_POSIX_ACL
# include "acl.h"
#endif
#define XATTR_USER_PREFIX "user."
static size_t
......@@ -46,11 +42,7 @@ ext3_xattr_user_get(struct inode *inode, const char *name,
return -EINVAL;
if (!test_opt(inode->i_sb, XATTR_USER))
return -EOPNOTSUPP;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
error = ext3_permission_locked(inode, MAY_READ);
#else
error = permission(inode, MAY_READ, NULL);
#endif
if (error)
return error;
......@@ -70,11 +62,7 @@ ext3_xattr_user_set(struct inode *inode, const char *name,
if ( !S_ISREG(inode->i_mode) &&
(!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
return -EPERM;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
error = ext3_permission_locked(inode, MAY_WRITE);
#else
error = permission(inode, MAY_WRITE, NULL);
#endif
if (error)
return error;
......
......@@ -62,6 +62,16 @@ struct ext3_inode_info {
__u32 i_prealloc_count;
#endif
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT3_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_sem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
#ifdef CONFIG_EXT3_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment