vfio: Delete vfio_get/put_group from vfio_iommu_group_notifier()

iommu_group_register_notifier()/iommu_group_unregister_notifier() are built using a blocking_notifier_chain which integrates a rwsem. The notifier function cannot be running outside its registration. When considering how the notifier function interacts with create/destroy of the group there are two fringe cases, the notifier starts before list_add(&vfio.group_list) and the notifier runs after the kref becomes 0. Prior to vfio_create_group() unlocking and returning we have container_users == 0 device_list == empty And this cannot change until the mutex is unlocked. After the kref goes to zero we must also have container_users == 0 device_list == empty Both are required because they are balanced operations and a 0 kref means some caller became unbalanced. Add the missing assertion that container_users must be zero as well. These two facts are important because when checking each operation we see: - IOMMU_GROUP_NOTIFY_ADD_DEVICE Empty device_list avoids the WARN_ON in vfio_group_nb_add_dev() 0 container_users ends the call - IOMMU_GROUP_NOTIFY_BOUND_DRIVER 0 container_users ends the call Finally, we have IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER, which only deletes items from the unbound list. During creation this list is empty, during kref == 0 nothing can read this list, and it will be freed soon. Since the vfio_group_release() doesn't hold the appropriate lock to manipulate the unbound_list and could race with the notifier, move the cleanup to directly before the kfree. This allows deleting all of the deferred group put code. Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/1-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.comSigned-off-by: Alex Williamson <alex.williamson@redhat.com>

vfio: Delete vfio_get/put_group from vfio_iommu_group_notifier()
iommu_group_register_notifier()/iommu_group_unregister_notifier() are built using a blocking_notifier_chain which integrates a rwsem. The notifier function cannot be running outside its registration. When considering how the notifier function interacts with create/destroy of the group there are two fringe cases, the notifier starts before list_add(&vfio.group_list) and the notifier runs after the kref becomes 0. Prior to vfio_create_group() unlocking and returning we have container_users == 0 device_list == empty And this cannot change until the mutex is unlocked. After the kref goes to zero we must also have container_users == 0 device_list == empty Both are required because they are balanced operations and a 0 kref means some caller became unbalanced. Add the missing assertion that container_users must be zero as well. These two facts are important because when checking each operation we see: - IOMMU_GROUP_NOTIFY_ADD_DEVICE Empty device_list avoids the WARN_ON in vfio_group_nb_add_dev() 0 container_users ends the call - IOMMU_GROUP_NOTIFY_BOUND_DRIVER 0 container_users ends the call Finally, we have IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER, which only deletes items from the unbound list. During creation this list is empty, during kref == 0 nothing can read this list, and it will be freed soon. Since the vfio_group_release() doesn't hold the appropriate lock to manipulate the unbound_list and could race with the notifier, move the cleanup to directly before the kfree. This allows deleting all of the deferred group put code. Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/1-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.comSigned-off-by: Alex Williamson <alex.williamson@redhat.com>
63b150fd · Jason Gunthorpe · Alex Williamson · 48f06ca4 · 63b150fd
Commit 63b150fd authored Oct 15, 2021 by Jason Gunthorpe Committed by Alex Williamson Oct 15, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 85 deletions

drivers/vfio/vfio.c drivers/vfio/vfio.c +15 -85

No files found.
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -324,12 +324,16 @@ static void vfio_container_put(struct vfio_container *container)

 static void vfio_group_unlock_and_free(struct vfio_group *group)
 {
+	struct vfio_unbound_dev *unbound, *tmp;
+
 	mutex_unlock(&vfio.group_lock);
-	/*
-	 * Unregister outside of lock.  A spurious callback is harmless now
-	 * that the group is no longer in vfio.group_list.
-	 */
 	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
+
+	list_for_each_entry_safe(unbound, tmp,
+				 &group->unbound_list, unbound_next) {
+		list_del(&unbound->unbound_next);
+		kfree(unbound);
+	}
 	kfree(group);
 }

@@ -360,14 +364,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

 	group->nb.notifier_call = vfio_iommu_group_notifier;
-
-	/*
-	 * blocking notifiers acquire a rwsem around registering and hold
-	 * it around callback.  Therefore, need to register outside of
-	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
-	 * do anything unless it can find the group in vfio.group_list, so
-	 * no harm in registering early.
-	 */
 	ret = iommu_group_register_notifier(iommu_group, &group->nb);
 	if (ret) {
 		kfree(group);
@@ -415,18 +411,18 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
 static void vfio_group_release(struct kref *kref)
 {
 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
-	struct vfio_unbound_dev *unbound, *tmp;
 	struct iommu_group *iommu_group = group->iommu_group;

+	/*
+	 * These data structures all have paired operations that can only be
+	 * undone when the caller holds a live reference on the group. Since all
+	 * pairs must be undone these WARN_ON's indicate some caller did not
+	 * properly hold the group reference.
+	 */
 	WARN_ON(!list_empty(&group->device_list));
+	WARN_ON(atomic_read(&group->container_users));
 	WARN_ON(group->notifier.head);

-	list_for_each_entry_safe(unbound, tmp,
-				 &group->unbound_list, unbound_next) {
-		list_del(&unbound->unbound_next);
-		kfree(unbound);
-	}
-
 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 	list_del(&group->vfio_next);
 	vfio_free_group_minor(group->minor);
@@ -439,61 +435,12 @@ static void vfio_group_put(struct vfio_group *group)
 	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 }

-struct vfio_group_put_work {
-	struct work_struct work;
-	struct vfio_group *group;
-};
-
-static void vfio_group_put_bg(struct work_struct *work)
-{
-	struct vfio_group_put_work *do_work;
-
-	do_work = container_of(work, struct vfio_group_put_work, work);
-
-	vfio_group_put(do_work->group);
-	kfree(do_work);
-}
-
-static void vfio_group_schedule_put(struct vfio_group *group)
-{
-	struct vfio_group_put_work *do_work;
-
-	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
-	if (WARN_ON(!do_work))
-		return;
-
-	INIT_WORK(&do_work->work, vfio_group_put_bg);
-	do_work->group = group;
-	schedule_work(&do_work->work);
-}
-
 /* Assume group_lock or group reference is held */
 static void vfio_group_get(struct vfio_group *group)
 {
 	kref_get(&group->kref);
 }

-/*
- * Not really a try as we will sleep for mutex, but we need to make
- * sure the group pointer is valid under lock and get a reference.
- */
-static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
-{
-	struct vfio_group *target = group;
-
-	mutex_lock(&vfio.group_lock);
-	list_for_each_entry(group, &vfio.group_list, vfio_next) {
-		if (group == target) {
-			vfio_group_get(group);
-			mutex_unlock(&vfio.group_lock);
-			return group;
-		}
-	}
-	mutex_unlock(&vfio.group_lock);
-
-	return NULL;
-}
-
 static
 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 {
@@ -691,14 +638,6 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 	struct device *dev = data;
 	struct vfio_unbound_dev *unbound;

-	/*
-	 * Need to go through a group_lock lookup to get a reference or we
-	 * risk racing a group being removed.  Ignore spurious notifies.
-	 */
-	group = vfio_group_try_get(group);
-	if (!group)
-		return NOTIFY_OK;
-
 	switch (action) {
 	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 		vfio_group_nb_add_dev(group, dev);
@@ -749,15 +688,6 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 		mutex_unlock(&group->unbound_lock);
 		break;
 	}
-
-	/*
-	 * If we're the last reference to the group, the group will be
-	 * released, which includes unregistering the iommu group notifier.
-	 * We hold a read-lock on that notifier list, unregistering needs
-	 * a write-lock... deadlock.  Release our reference asynchronously
-	 * to avoid that situation.
-	 */
-	vfio_group_schedule_put(group);
 	return NOTIFY_OK;
 }