Commit 91b79ba7 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] separate locking for vfsmounts

From: Maneesh Soni <maneesh@in.ibm.com>

While path walking we do follow_mount or follow_down, which use
dcache_lock for serialisation.  vfsmount-related operations also take
dcache_lock for all updates.  I think we can use a separate lock for
the vfsmount work and so improve path walking.

The following two patches do exactly that.  The first replaces
dcache_lock with a new vfsmount_lock in namespace.c; the lock is local
to namespace.c and is not required outside it.  The second patch uses
RCU to make lookup_mnt() lock-free.  The patches are quite simple and
straightforward.
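
The lookup_mnt-rcu.patch itself is not reproduced in the diff below, so
here is a minimal sketch of its idea: take no spinlock on the read side
and rely on RCU instead.  This is illustrative only; it assumes the
mount hash is updated with the RCU list helpers and that the vfsmount
is freed via call_rcu(), and the list_for_each_rcu() helper name is an
assumption about the era's API, not necessarily what the patch uses.

	/*
	 * Sketch only, not the exact lookup_mnt-rcu.patch.  Assumes hash
	 * insert/remove use list_add_rcu()/list_del_rcu() and that
	 * free_vfsmnt() defers the actual free via call_rcu(), so a
	 * reader inside rcu_read_lock() never touches freed memory.
	 */
	struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
	{
		struct list_head *head = mount_hashtable + hash(mnt, dentry);
		struct list_head *tmp;
		struct vfsmount *p, *found = NULL;

		rcu_read_lock();		/* no spinlock on the read side */
		list_for_each_rcu(tmp, head) {
			p = list_entry(tmp, struct vfsmount, mnt_hash);
			if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
				found = mntget(p);	/* pin before leaving RCU */
				break;
			}
		}
		rcu_read_unlock();
		return found;
	}

Taking the reference inside the RCU section is enough here because a
mount still on the hash is pinned by its parent and namespace, so its
count cannot already have reached zero when mntget() runs.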

The lockmeter results show reduced contention and fewer lock
acquisitions for dcache_lock while running dcachebench* on a 4-way SMP
box:

    SPINLOCKS         HOLD            WAIT
    UTIL  CON    MEAN(  MAX )   MEAN(  MAX )(% CPU)     TOTAL NOWAIT SPIN RJECT  NAME

  baselkm-2569:
    20.7% 20.9%  0.5us( 146us)  2.9us( 144us)(0.81%)  31590840 79.1% 20.9%    0%  dcache_lock
  mntlkm-2569:
    14.3% 13.6%  0.4us( 170us)  2.9us( 187us)(0.42%)  23071746 86.4% 13.6%    0%  dcache_lock

We get more than 8% improvement on 4-way SMP and 44% improvement on
16-way NUMAQ while running dcachebench*.

		Average (usecs/iteration)	Std. Deviation
		(lower is better)
4-way SMP
  2.5.69	15739.3				470.90
  2.5.69-mnt	14459.6				298.51

16-way NUMAQ
  2.5.69	120426.5			363.78
  2.5.69-mnt	 63225.8			427.60

*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html

 vfsmount_lock.patch
 -------------------
 - Replaces dcache_lock with the new vfsmount_lock for all
   mount-related operations.  This removes the need to take dcache_lock
   while doing follow_mount or follow_down operations in path walking.
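
Since lookup_mnt() now takes the mount reference itself (under
vfsmount_lock), callers follow a simpler contract.  A hypothetical
helper, named here only for illustration, shows the caller side:

	/* Hypothetical example, not part of the patch: lookup_mnt()
	 * returns the child mount with its refcount already raised (or
	 * NULL), so the caller never touches dcache_lock and simply
	 * balances with mntput().
	 */
	static int crosses_mount_point(struct vfsmount *mnt, struct dentry *dentry)
	{
		struct vfsmount *child = lookup_mnt(mnt, dentry);

		if (!child)
			return 0;
		mntput(child);	/* drop the reference lookup_mnt() took */
		return 1;
	}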

I re-ran dcachebench with 2.5.70 as the base on a 16-way NUMAQ box.

                	Average (usecs/iteration)       Std. Deviation
                	(lower is better)
16-way NUMAQ
2.5.70 				120710.9		 	230.67
 + vfsmount_lock.patch  	65209.6				242.97
    + lookup_mnt-rcu.patch 	64042.3				416.61

So the lock splitting alone (vfsmount_lock.patch) gives almost all of
the benefit.
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1451,19 +1451,24 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
 int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
 {
 	int result;
+	unsigned long seq;
 
 	result = 0;
-	for (;;) {
-		if (new_dentry != old_dentry) {
-			struct dentry * parent = new_dentry->d_parent;
-			if (parent == new_dentry)
-				break;
-			new_dentry = parent;
-			continue;
+	do {
+		seq = read_seqbegin(&rename_lock);
+		for (;;) {
+			if (new_dentry != old_dentry) {
+				struct dentry * parent = new_dentry->d_parent;
+				if (parent == new_dentry)
+					break;
+				new_dentry = parent;
+				continue;
+			}
+			result = 1;
+			break;
 		}
-		result = 1;
-		break;
-	}
+	} while (read_seqretry(&rename_lock, seq));
 
 	return result;
 }
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, struct dentry **dentry)
 	return 1;
 }
 
+/* no need for dcache_lock, as serialization is taken care of in
+ * namespace.c
+ */
 static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
 	int res = 0;
 	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted;
-		spin_lock(&dcache_lock);
-		mounted = lookup_mnt(*mnt, *dentry);
-		if (!mounted) {
-			spin_unlock(&dcache_lock);
+		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+		if (!mounted)
 			break;
-		}
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 	return res;
 }
 
+/* no need for dcache_lock, as serialization is taken care of in
+ * namespace.c
+ */
 static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *mounted;
 
-	spin_lock(&dcache_lock);
 	mounted = lookup_mnt(*mnt, *dentry);
 	if (mounted) {
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
 		return 1;
 	}
-	spin_unlock(&dcache_lock);
 	return 0;
 }
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -26,6 +26,8 @@
 extern int __init init_rootfs(void);
 extern int __init sysfs_init(void);
 
+/* spinlock for vfsmount-related operations, in place of dcache_lock */
+spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
 static kmem_cache_t *mnt_cache;
@@ -66,30 +68,38 @@ void free_vfsmnt(struct vfsmount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head * head = mount_hashtable + hash(mnt, dentry);
 	struct list_head * tmp = head;
-	struct vfsmount *p;
+	struct vfsmount *p, *found = NULL;
 
+	spin_lock(&vfsmount_lock);
 	for (;;) {
 		tmp = tmp->next;
 		p = NULL;
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
-		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+			found = mntget(p);
 			break;
+		}
 	}
-	return p;
+	spin_unlock(&vfsmount_lock);
+	return found;
 }
 
 static int check_mnt(struct vfsmount *mnt)
 {
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	while (mnt->mnt_parent != mnt)
 		mnt = mnt->mnt_parent;
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	return mnt == current->namespace->root;
 }
@@ -263,15 +273,15 @@ void umount_tree(struct vfsmount *mnt)
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
 		if (mnt->mnt_parent == mnt) {
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		} else {
 			struct nameidata old_nd;
 			detach_mnt(mnt, &old_nd);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 			path_release(&old_nd);
 		}
 		mntput(mnt);
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 	}
 }
@@ -324,17 +334,17 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	}
 
 	down_write(&current->namespace->sem);
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 
 	if (atomic_read(&sb->s_active) == 1) {
 		/* last instance - try to be smart */
-		spin_unlock(&dcache_lock);
+		spin_unlock(&vfsmount_lock);
 		lock_kernel();
 		DQUOT_OFF(sb);
 		acct_auto_close(sb);
 		unlock_kernel();
 		security_sb_umount_close(mnt);
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 	}
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -342,7 +352,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		umount_tree(mnt);
 		retval = 0;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	if (retval)
 		security_sb_umount_busy(mnt);
 	up_write(&current->namespace->sem);
@@ -449,18 +459,18 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 			q = clone_mnt(p, p->mnt_root);
 			if (!q)
 				goto Enomem;
-			spin_lock(&dcache_lock);
+			spin_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &nd);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 		umount_tree(res);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&vfsmount_lock);
 	}
 	return NULL;
 }
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
 		struct list_head head;
 
@@ -495,7 +505,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		mntget(mnt);
 		err = 0;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
@@ -532,9 +542,9 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	if (mnt) {
 		err = graft_tree(mnt, nd);
 		if (err) {
-			spin_lock(&dcache_lock);
+			spin_lock(&vfsmount_lock);
 			umount_tree(mnt);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		} else
 			mntput(mnt);
 	}
@@ -599,7 +609,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	if (IS_DEADDIR(nd->dentry->d_inode))
 		goto out1;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
 		goto out2;
 
@@ -623,7 +633,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	detach_mnt(old_nd.mnt, &parent_nd);
 	attach_mnt(old_nd.mnt, nd);
 out2:
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 out1:
 	up(&nd->dentry->d_inode->i_sem);
 out:
@@ -804,9 +814,9 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	down_write(&tsk->namespace->sem);
 	/* First pass: copy the tree topology */
 	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 
 	/* Second pass: switch the tsk->fs->* elements */
 	if (fs) {
@@ -1027,7 +1037,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	if (new_nd.mnt->mnt_root != new_nd.dentry)
 		goto out2; /* not a mountpoint */
 	tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (tmp != new_nd.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -1044,7 +1054,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	detach_mnt(user_nd.mnt, &root_parent);
 	attach_mnt(user_nd.mnt, &old_nd);
 	attach_mnt(new_nd.mnt, &root_parent);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
 	security_sb_post_pivotroot(&user_nd, &new_nd);
 	error = 0;
@@ -1061,7 +1071,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	unlock_kernel();
 	return error;
 out3:
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	goto out2;
 }
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -307,20 +307,22 @@ static int proc_check_root(struct inode *inode)
 	base = dget(current->fs->root);
 	read_unlock(&current->fs->lock);
 
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	de = root;
 	mnt = vfsmnt;
 
 	while (vfsmnt != our_vfsmnt) {
-		if (vfsmnt == vfsmnt->mnt_parent)
+		if (vfsmnt == vfsmnt->mnt_parent) {
+			spin_unlock(&vfsmount_lock);
 			goto out;
+		}
 		de = vfsmnt->mnt_mountpoint;
 		vfsmnt = vfsmnt->mnt_parent;
 	}
+	spin_unlock(&vfsmount_lock);
 
 	if (!is_subdir(de, base))
 		goto out;
-	spin_unlock(&dcache_lock);
 
 exit:
 	dput(base);
@@ -329,7 +331,6 @@ static int proc_check_root(struct inode *inode)
 	mntput(mnt);
 	return res;
 out:
-	spin_unlock(&dcache_lock);
 	res = -EACCES;
 	goto exit;
 }
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -54,6 +54,7 @@ extern void free_vfsmnt(struct vfsmount *mnt);
 extern struct vfsmount *alloc_vfsmnt(const char *name);
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
 	const char *name, void *data);
+extern spinlock_t vfsmount_lock;
 
 #endif
 #endif /* _LINUX_MOUNT_H */