Commit 91b79ba7 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] separate locking for vfsmounts

From: Maneesh Soni <maneesh@in.ibm.com>

While path walking we do follow_mount or follow_down, which use
dcache_lock for serialisation.  vfsmount-related operations also take
dcache_lock for all updates.  I think we can use a separate lock for
the vfsmount work and so improve path walking.

The following two patches do exactly that.  The first replaces
dcache_lock with a new vfsmount_lock in namespace.c; the lock is local
to namespace.c and is not required outside it.  The second patch uses
RCU to make lookup_mnt() lock-free.  The patches are quite simple and
straightforward.
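
The lookup_mnt-rcu.patch itself is not reproduced in the diff below, so
here is a minimal sketch of its idea: take no spinlock on the read side
and rely on RCU instead.  This is illustrative only; it assumes the
mount hash is updated with the RCU list helpers and that the vfsmount
is freed via call_rcu(), and the list_for_each_rcu() helper name is an
assumption about the era's API, not necessarily what the patch uses.

	/*
	 * Sketch only, not the exact lookup_mnt-rcu.patch.  Assumes hash
	 * insert/remove use list_add_rcu()/list_del_rcu() and that
	 * free_vfsmnt() defers the actual free via call_rcu(), so a
	 * reader inside rcu_read_lock() never touches freed memory.
	 */
	struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
	{
		struct list_head *head = mount_hashtable + hash(mnt, dentry);
		struct list_head *tmp;
		struct vfsmount *p, *found = NULL;

		rcu_read_lock();		/* no spinlock on the read side */
		list_for_each_rcu(tmp, head) {
			p = list_entry(tmp, struct vfsmount, mnt_hash);
			if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
				found = mntget(p);	/* pin before leaving RCU */
				break;
			}
		}
		rcu_read_unlock();
		return found;
	}

Taking the reference inside the RCU section is enough here because a
mount still on the hash is pinned by its parent and namespace, so its
count cannot already have reached zero when mntget() runs.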

The lockmeter results show reduced contention and fewer lock
acquisitions for dcache_lock while running dcachebench* on a 4-way SMP
box:

    SPINLOCKS         HOLD            WAIT
    UTIL  CON    MEAN(  MAX )   MEAN(  MAX )(% CPU)     TOTAL NOWAIT SPIN RJECT  NAME

  baselkm-2569:
    20.7% 20.9%  0.5us( 146us)  2.9us( 144us)(0.81%)  31590840 79.1% 20.9%    0%  dcache_lock
  mntlkm-2569:
    14.3% 13.6%  0.4us( 170us)  2.9us( 187us)(0.42%)  23071746 86.4% 13.6%    0%  dcache_lock

We get more than 8% improvement on 4-way SMP and 44% improvement on
16-way NUMAQ while running dcachebench*.

		Average (usecs/iteration)	Std. Deviation
		(lower is better)
4-way SMP
  2.5.69	15739.3				470.90
  2.5.69-mnt	14459.6				298.51

16-way NUMAQ
  2.5.69	120426.5			363.78
  2.5.69-mnt	 63225.8			427.60

*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html

 vfsmount_lock.patch
 -------------------
 - Replaces dcache_lock with the new vfsmount_lock for all
   mount-related operations.  This removes the need to take dcache_lock
   while doing follow_mount or follow_down operations in path walking.
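
Since lookup_mnt() now takes the mount reference itself (under
vfsmount_lock), callers follow a simpler contract.  A hypothetical
helper, named here only for illustration, shows the caller side:

	/* Hypothetical example, not part of the patch: lookup_mnt()
	 * returns the child mount with its refcount already raised (or
	 * NULL), so the caller never touches dcache_lock and simply
	 * balances with mntput().
	 */
	static int crosses_mount_point(struct vfsmount *mnt, struct dentry *dentry)
	{
		struct vfsmount *child = lookup_mnt(mnt, dentry);

		if (!child)
			return 0;
		mntput(child);	/* drop the reference lookup_mnt() took */
		return 1;
	}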

I re-ran dcachebench with 2.5.70 as the base on a 16-way NUMAQ box.

                	Average (usecs/iteration)       Std. Deviation
                	(lower is better)
16-way NUMAQ
2.5.70 				120710.9		 	230.67
 + vfsmount_lock.patch  	65209.6				242.97
    + lookup_mnt-rcu.patch 	64042.3				416.61

So the lock splitting alone (vfsmount_lock.patch) gives almost all of
the benefit.
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1451,19 +1451,24 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
 int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
 {
 	int result;
+	unsigned long seq;
 
 	result = 0;
-	for (;;) {
-		if (new_dentry != old_dentry) {
-			struct dentry * parent = new_dentry->d_parent;
-			if (parent == new_dentry)
-				break;
-			new_dentry = parent;
-			continue;
+	do {
+		seq = read_seqbegin(&rename_lock);
+		for (;;) {
+			if (new_dentry != old_dentry) {
+				struct dentry * parent = new_dentry->d_parent;
+				if (parent == new_dentry)
+					break;
+				new_dentry = parent;
+				continue;
+			}
+			result = 1;
+			break;
 		}
-		result = 1;
-		break;
-	}
+	} while (read_seqretry(&rename_lock, seq));
 
 	return result;
 }
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, struct dentry **dentry)
 	return 1;
 }
 
+/* no need for dcache_lock, as serialization is taken care of in
+ * namespace.c
+ */
 static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
 	int res = 0;
 	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted;
-		spin_lock(&dcache_lock);
-		mounted = lookup_mnt(*mnt, *dentry);
-		if (!mounted) {
-			spin_unlock(&dcache_lock);
+		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+		if (!mounted)
 			break;
-		}
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 	return res;
 }
 
+/* no need for dcache_lock, as serialization is taken care of in
+ * namespace.c
+ */
 static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *mounted;
 
-	spin_lock(&dcache_lock);
 	mounted = lookup_mnt(*mnt, *dentry);
 	if (mounted) {
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
 		return 1;
 	}
-	spin_unlock(&dcache_lock);
 	return 0;
 }
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -26,6 +26,8 @@
 extern int __init init_rootfs(void);
 extern int __init sysfs_init(void);
 
+/* spinlock for vfsmount-related operations, in place of dcache_lock */
+spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
 static kmem_cache_t *mnt_cache;
@@ -66,30 +68,38 @@ void free_vfsmnt(struct vfsmount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head * head = mount_hashtable + hash(mnt, dentry);
 	struct list_head * tmp = head;
-	struct vfsmount *p;
+	struct vfsmount *p, *found = NULL;
 
+	spin_lock(&vfsmount_lock);
 	for (;;) {
 		tmp = tmp->next;
 		p = NULL;
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
-		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+			found = mntget(p);
 			break;
+		}
 	}
-	return p;
+	spin_unlock(&vfsmount_lock);
+	return found;
 }
 
 static int check_mnt(struct vfsmount *mnt)
 {
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	while (mnt->mnt_parent != mnt)
 		mnt = mnt->mnt_parent;
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	return mnt == current->namespace->root;
 }
@@ -263,15 +273,15 @@ void umount_tree(struct vfsmount *mnt)
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
 		if (mnt->mnt_parent == mnt) {
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		} else {
 			struct nameidata old_nd;
 			detach_mnt(mnt, &old_nd);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 			path_release(&old_nd);
 		}
 		mntput(mnt);
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 	}
 }
@@ -324,17 +334,17 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	}
 
 	down_write(&current->namespace->sem);
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 
 	if (atomic_read(&sb->s_active) == 1) {
 		/* last instance - try to be smart */
-		spin_unlock(&dcache_lock);
+		spin_unlock(&vfsmount_lock);
 		lock_kernel();
 		DQUOT_OFF(sb);
 		acct_auto_close(sb);
 		unlock_kernel();
 		security_sb_umount_close(mnt);
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 	}
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -342,7 +352,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		umount_tree(mnt);
 		retval = 0;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	if (retval)
 		security_sb_umount_busy(mnt);
 	up_write(&current->namespace->sem);
@@ -449,18 +459,18 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 			q = clone_mnt(p, p->mnt_root);
 			if (!q)
 				goto Enomem;
-			spin_lock(&dcache_lock);
+			spin_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &nd);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 		umount_tree(res);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&vfsmount_lock);
 	}
 	return NULL;
 }
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
 		struct list_head head;
 
@@ -495,7 +505,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		mntget(mnt);
 		err = 0;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
@@ -532,9 +542,9 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	if (mnt) {
 		err = graft_tree(mnt, nd);
 		if (err) {
-			spin_lock(&dcache_lock);
+			spin_lock(&vfsmount_lock);
 			umount_tree(mnt);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		} else
 			mntput(mnt);
 	}
@@ -599,7 +609,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	if (IS_DEADDIR(nd->dentry->d_inode))
 		goto out1;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
 		goto out2;
 
@@ -623,7 +633,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	detach_mnt(old_nd.mnt, &parent_nd);
 	attach_mnt(old_nd.mnt, nd);
 out2:
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 out1:
 	up(&nd->dentry->d_inode->i_sem);
 out:
@@ -804,9 +814,9 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	down_write(&tsk->namespace->sem);
 	/* First pass: copy the tree topology */
 	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 
 	/* Second pass: switch the tsk->fs->* elements */
 	if (fs) {
@@ -1027,7 +1037,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	if (new_nd.mnt->mnt_root != new_nd.dentry)
 		goto out2; /* not a mountpoint */
 	tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (tmp != new_nd.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -1044,7 +1054,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	detach_mnt(user_nd.mnt, &root_parent);
 	attach_mnt(user_nd.mnt, &old_nd);
 	attach_mnt(new_nd.mnt, &root_parent);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
 	security_sb_post_pivotroot(&user_nd, &new_nd);
 	error = 0;
@@ -1061,7 +1071,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	unlock_kernel();
 	return error;
 out3:
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	goto out2;
 }
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -307,20 +307,22 @@ static int proc_check_root(struct inode *inode)
 	base = dget(current->fs->root);
 	read_unlock(&current->fs->lock);
 
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	de = root;
 	mnt = vfsmnt;
 
 	while (vfsmnt != our_vfsmnt) {
-		if (vfsmnt == vfsmnt->mnt_parent)
+		if (vfsmnt == vfsmnt->mnt_parent) {
+			spin_unlock(&vfsmount_lock);
 			goto out;
+		}
 		de = vfsmnt->mnt_mountpoint;
 		vfsmnt = vfsmnt->mnt_parent;
 	}
+	spin_unlock(&vfsmount_lock);
 
 	if (!is_subdir(de, base))
 		goto out;
-	spin_unlock(&dcache_lock);
 
 exit:
 	dput(base);
@@ -329,7 +331,6 @@ static int proc_check_root(struct inode *inode)
 	mntput(mnt);
 	return res;
 out:
-	spin_unlock(&dcache_lock);
 	res = -EACCES;
 	goto exit;
 }
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -54,6 +54,7 @@ extern void free_vfsmnt(struct vfsmount *mnt);
 extern struct vfsmount *alloc_vfsmnt(const char *name);
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
 	const char *name, void *data);
+extern spinlock_t vfsmount_lock;
 
 #endif
 #endif /* _LINUX_MOUNT_H */