Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull assorted fixes - mostly vfs - from Al Viro: "Assorted fixes, with an unexpected detour into vfio refcounting logics (fell out when digging in an analog of eventpoll race in there)." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: task_work: add a scheduling point in task_work_run() fs: fix fs/namei.c kernel-doc warnings eventpoll: use-after-possible-free in epoll_create1() vfio: grab vfio_device reference *before* exposing the sucker via fd_install() vfio: get rid of vfio_device_put()/vfio_group_get_device* races vfio: get rid of open-coding kref_put_mutex introduce kref_put_mutex() vfio: don't dereference after kfree... mqueue: lift mnt_want_write() outside ->i_mutex, clean up a bit

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull assorted fixes - mostly vfs - from Al Viro: "Assorted fixes, with an unexpected detour into vfio refcounting logics (fell out when digging in an analog of eventpoll race in there)." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: task_work: add a scheduling point in task_work_run() fs: fix fs/namei.c kernel-doc warnings eventpoll: use-after-possible-free in epoll_create1() vfio: grab vfio_device reference *before* exposing the sucker via fd_install() vfio: get rid of vfio_device_put()/vfio_group_get_device* races vfio: get rid of open-coding kref_put_mutex introduce kref_put_mutex() vfio: don't dereference after kfree... mqueue: lift mnt_want_write() outside ->i_mutex, clean up a bit
467e9e51 · Linus Torvalds · 23dcfa61 · 88ec2789 · 467e9e51 · 467e9e51
Commit 467e9e51 authored Aug 22, 2012 by Linus Torvalds
Showing with 56 additions and 46 deletions

drivers/vfio/vfio.c drivers/vfio/vfio.c +7 -12

fs/eventpoll.c fs/eventpoll.c +1 -1

fs/namei.c fs/namei.c +2 -0

include/linux/kref.h include/linux/kref.h +18 -0

ipc/mqueue.c ipc/mqueue.c +28 -33

No files found.
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -264,6 +264,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 	return group;
 }

+/* called with vfio.group_lock held */
 static void vfio_group_release(struct kref *kref)
 {
 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
@@ -287,13 +288,7 @@ static void vfio_group_release(struct kref *kref)

 static void vfio_group_put(struct vfio_group *group)
 {
-	mutex_lock(&vfio.group_lock);
-	/*
-	 * Release needs to unlock to unregister the notifier, so only
-	 * unlock if not released.
-	 */
-	if (!kref_put(&group->kref, vfio_group_release))
-		mutex_unlock(&vfio.group_lock);
+	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 }

 /* Assume group_lock or group reference is held */
@@ -401,7 +396,6 @@ static void vfio_device_release(struct kref *kref)
 						  struct vfio_device, kref);
 	struct vfio_group *group = device->group;

-	mutex_lock(&group->device_lock);
 	list_del(&device->group_next);
 	mutex_unlock(&group->device_lock);

@@ -416,8 +410,9 @@ static void vfio_device_release(struct kref *kref)
 /* Device reference always implies a group reference */
 static void vfio_device_put(struct vfio_device *device)
 {
-	kref_put(&device->kref, vfio_device_release);
-	vfio_group_put(device->group);
+	struct vfio_group *group = device->group;
+	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
+	vfio_group_put(group);
 }

 static void vfio_device_get(struct vfio_device *device)
@@ -1116,10 +1111,10 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
 		 */
 		filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

-		fd_install(ret, filep);
-
 		vfio_device_get(device);
 		atomic_inc(&group->container_users);
+
+		fd_install(ret, filep);
 		break;
 	}
 	mutex_unlock(&group->device_lock);

--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1654,8 +1654,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 		error = PTR_ERR(file);
 		goto out_free_fd;
 	}
-	fd_install(fd, file);
 	ep->file = file;
+	fd_install(fd, file);
 	return fd;

 out_free_fd:

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -352,6 +352,7 @@ int __inode_permission(struct inode *inode, int mask)
 /**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
+ * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
@@ -656,6 +657,7 @@ int sysctl_protected_hardlinks __read_mostly = 1;
 /**
 * may_follow_link - Check symlink following for unsafe situations
 * @link: The path of the symlink
+ * @nd: nameidata pathwalk data
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is

--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -18,6 +18,7 @@
 #include <linux/bug.h>
 #include <linux/atomic.h>
 #include <linux/kernel.h>
+#include <linux/mutex.h>

 struct kref {
 	atomic_t refcount;
@@ -93,4 +94,21 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)
 {
 	return kref_sub(kref, 1, release);
 }
+
+static inline int kref_put_mutex(struct kref *kref,
+				 void (*release)(struct kref *kref),
+				 struct mutex *lock)
+{
+	WARN_ON(release == NULL);
+        if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
+		mutex_lock(lock);
+		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
+			mutex_unlock(lock);
+			return 0;
+		}
+		release(kref);
+		return 1;
+	}
+	return 0;
+}
 #endif /* _KREF_H_ */
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -726,7 +726,6 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
 			struct mq_attr *attr)
 {
 	const struct cred *cred = current_cred();
-	struct file *result;
 	int ret;

 	if (attr) {
@@ -748,21 +747,11 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
 	}

 	mode &= ~current_umask();
-	ret = mnt_want_write(path->mnt);
-	if (ret)
-		return ERR_PTR(ret);
 	ret = vfs_create(dir, path->dentry, mode, true);
 	path->dentry->d_fsdata = NULL;
-	if (!ret)
-		result = dentry_open(path, oflag, cred);
-	else
-		result = ERR_PTR(ret);
-	/*
-	 * dentry_open() took a persistent mnt_want_write(),
-	 * so we can now drop this one.
-	 */
-	mnt_drop_write(path->mnt);
-	return result;
+	if (ret)
+		return ERR_PTR(ret);
+	return dentry_open(path, oflag, cred);
 }

 /* Opens existing queue */
@@ -788,7 +777,9 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 	struct mq_attr attr;
 	int fd, error;
 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	struct dentry *root = ipc_ns->mq_mnt->mnt_root;
+	struct vfsmount *mnt = ipc_ns->mq_mnt;
+	struct dentry *root = mnt->mnt_root;
+	int ro;

 	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
 		return -EFAULT;
@@ -802,6 +793,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 	if (fd < 0)
 		goto out_putname;

+	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
 	error = 0;
 	mutex_lock(&root->d_inode->i_mutex);
 	path.dentry = lookup_one_len(name, root, strlen(name));
@@ -809,7 +801,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 		error = PTR_ERR(path.dentry);
 		goto out_putfd;
 	}
-	path.mnt = mntget(ipc_ns->mq_mnt);
+	path.mnt = mntget(mnt);

 	if (oflag & O_CREAT) {
 		if (path.dentry->d_inode) {	/* entry already exists */
@@ -820,6 +812,10 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 			}
 			filp = do_open(&path, oflag);
 		} else {
+			if (ro) {
+				error = ro;
+				goto out;
+			}
 			filp = do_create(ipc_ns, root->d_inode,
 						&path, oflag, mode,
 						u_attr ? &attr : NULL);
@@ -845,6 +841,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 		fd = error;
 	}
 	mutex_unlock(&root->d_inode->i_mutex);
+	mnt_drop_write(mnt);
 out_putname:
 	putname(name);
 	return fd;
@@ -857,40 +854,38 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 	struct dentry *dentry;
 	struct inode *inode = NULL;
 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct vfsmount *mnt = ipc_ns->mq_mnt;

 	name = getname(u_name);
 	if (IS_ERR(name))
 		return PTR_ERR(name);

-	mutex_lock_nested(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex,
-			I_MUTEX_PARENT);
-	dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
+	err = mnt_want_write(mnt);
+	if (err)
+		goto out_name;
+	mutex_lock_nested(&mnt->mnt_root->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, mnt->mnt_root, strlen(name));
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
 		goto out_unlock;
 	}

-	if (!dentry->d_inode) {
-		err = -ENOENT;
-		goto out_err;
-	}
-
 	inode = dentry->d_inode;
-	if (inode)
+	if (!inode) {
+		err = -ENOENT;
+	} else {
 		ihold(inode);
-	err = mnt_want_write(ipc_ns->mq_mnt);
-	if (err)
-		goto out_err;
-	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
-	mnt_drop_write(ipc_ns->mq_mnt);
-out_err:
+		err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+	}
 	dput(dentry);

 out_unlock:
-	mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
-	putname(name);
+	mutex_unlock(&mnt->mnt_root->d_inode->i_mutex);
 	if (inode)
 		iput(inode);
+	mnt_drop_write(mnt);
+out_name:
+	putname(name);

 	return err;
 }