// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include "vfio.h"
Alex Williamson's avatar
Alex Williamson committed
36 37 38 39 40 41 42 43 44 45

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
46 47
	struct mutex			group_lock; /* locks group_list */
	struct ida			group_ida;
48
	dev_t				group_devt;
Alex Williamson's avatar
Alex Williamson committed
49 50 51 52 53 54 55 56 57 58
} vfio;

/* One registered IOMMU backend: its ops table plus list linkage. */
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next; /* entry in vfio.iommu_drivers_list */
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
59
	struct rw_semaphore		group_lock;
Alex Williamson's avatar
Alex Williamson committed
60 61
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
62
	bool				noiommu;
Alex Williamson's avatar
Alex Williamson committed
63 64 65
};

struct vfio_group {
66 67
	struct device 			dev;
	struct cdev			cdev;
68
	refcount_t			users;
69
	unsigned int			container_users;
Alex Williamson's avatar
Alex Williamson committed
70 71 72 73 74 75
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct list_head		vfio_next;
	struct list_head		container_next;
76
	enum vfio_group_type		type;
77
	unsigned int			dev_counter;
78
	struct rw_semaphore		group_rwsem;
79
	struct kvm			*kvm;
80
	struct file			*opened_file;
81
	struct blocking_notifier_head	notifier;
Alex Williamson's avatar
Alex Williamson committed
82 83
};

84 85 86 87 88 89 90
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Backing variable for the "enable_unsafe_noiommu_mode" module parameter;
 * consulted before a no-IOMMU group is created and when the no-IOMMU
 * backend answers VFIO_CHECK_EXTENSION.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

91
static DEFINE_XARRAY(vfio_device_set_xa);
92
static const struct file_operations vfio_group_fops;
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164

/*
 * Attach @device to the device set identified by @set_id, creating the set
 * if none is stored in vfio_device_set_xa for that id yet.
 *
 * Returns 0 on success, -EINVAL if @set_id is NULL, -ENOMEM if a new set
 * cannot be allocated, or the error reported by the xarray store.
 */
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	/* The pointer value itself is the xarray index. */
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	/* Nothing stored yet: build a candidate set with the lock dropped. */
	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	/* Install the candidate only if the slot is still empty. */
	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	/* The candidate was not stored: the slot was taken or the store failed. */
	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	/* Reached with the xarray lock held; count this device as a member. */
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * ->open callback of the no-IOMMU backend.  Accepts only the
 * VFIO_NOIOMMU_IOMMU magic and requires CAP_SYS_RAWIO.  The backend keeps
 * no private data, so success is reported as NULL.
 */
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);

	return capable(CAP_SYS_RAWIO) ? NULL : ERR_PTR(-EPERM);
}

/* ->release callback of the no-IOMMU backend; ->open allocates nothing. */
static void vfio_noiommu_release(void *iommu_data)
{
}

/*
 * ->ioctl callback of the no-IOMMU backend.  Only VFIO_CHECK_EXTENSION is
 * handled: it reports 1 when the noiommu module parameter is set and @arg
 * is VFIO_NOIOMMU_IOMMU, otherwise 0.  Any other command yields -ENOTTY.
 */
static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd != VFIO_CHECK_EXTENSION)
		return -ENOTTY;

	if (noiommu && arg == VFIO_NOIOMMU_IOMMU)
		return 1;
	return 0;
}

static int vfio_noiommu_attach_group(void *iommu_data,
190
		struct iommu_group *iommu_group, enum vfio_group_type type)
191
{
192
	return 0;
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
}

/* ->detach_group callback of the no-IOMMU backend: nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

/* Ops table of the built-in no-IOMMU backend. */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};

210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
/*
 * A noiommu container may only be backed by vfio-noiommu, and vfio-noiommu
 * may only back a noiommu container.  Returns true when @driver and
 * @container agree on that.
 */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	bool is_noiommu_driver = (driver->ops == &vfio_noiommu_ops);

	return container->noiommu == is_noiommu_driver;
}
#else
/* Without CONFIG_VFIO_NOIOMMU every driver is allowed for every container. */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */
226

227
/*
 * IOMMU driver registration
 */
/*
 * Add @ops to the list of IOMMU backends.
 *
 * Returns 0 on success, -ENOMEM if the list entry cannot be allocated, or
 * -EINVAL if @ops is already registered.
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *new_drv, *cur;
	int ret = 0;

	new_drv = kzalloc(sizeof(*new_drv), GFP_KERNEL);
	if (!new_drv)
		return -ENOMEM;

	new_drv->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Refuse to register the same ops table twice. */
	list_for_each_entry(cur, &vfio.iommu_drivers_list, vfio_next) {
		if (cur->ops == ops) {
			ret = -EINVAL;
			break;
		}
	}

	if (!ret)
		list_add(&new_drv->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	/* The unused entry is released only after the lock is dropped. */
	if (ret)
		kfree(new_drv);

	return ret;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

static void vfio_group_get(struct vfio_group *group);

278
/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
/* Take an additional reference on @container. */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

/* kref release callback: frees the container that embeds @kref. */
static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container =
		container_of(kref, struct vfio_container, kref);

	kfree(container);
}

/* Drop a reference on @container; the last put frees it. */
static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

302
/*
 * Group objects - create, release, get, put, search
 */
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
/*
 * Search vfio.group_list for the group wrapping @iommu_group.  On a match a
 * group reference is taken and the group returned; otherwise NULL.  The
 * callers in this file invoke it with vfio.group_lock held.
 */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *cur;

	list_for_each_entry(cur, &vfio.group_list, vfio_next) {
		if (cur->iommu_group != iommu_group)
			continue;

		vfio_group_get(cur);
		return cur;
	}

	return NULL;
}

/*
 * Locked wrapper around __vfio_group_get_from_iommu(): takes
 * vfio.group_lock for the duration of the lookup.
 */
static struct vfio_group *
vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *found;

	mutex_lock(&vfio.group_lock);
	found = __vfio_group_get_from_iommu(iommu_group);
	mutex_unlock(&vfio.group_lock);

	return found;
}

330
static void vfio_group_release(struct device *dev)
Alex Williamson's avatar
Alex Williamson committed
331
{
332 333 334 335 336 337 338 339 340 341 342 343 344
	struct vfio_group *group = container_of(dev, struct vfio_group, dev);

	mutex_destroy(&group->device_lock);
	iommu_group_put(group->iommu_group);
	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
	kfree(group);
}

static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
					   enum vfio_group_type type)
{
	struct vfio_group *group;
	int minor;
Alex Williamson's avatar
Alex Williamson committed
345 346 347 348 349

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

350 351 352 353 354 355 356 357 358 359 360 361 362
	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(group);
		return ERR_PTR(minor);
	}

	device_initialize(&group->dev);
	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
	group->dev.class = vfio.class;
	group->dev.release = vfio_group_release;
	cdev_init(&group->cdev, &vfio_group_fops);
	group->cdev.owner = THIS_MODULE;

363
	refcount_set(&group->users, 1);
364
	init_rwsem(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
365 366 367
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	group->iommu_group = iommu_group;
368
	/* put in vfio_group_release() */
369
	iommu_group_ref_get(iommu_group);
370
	group->type = type;
371
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
Alex Williamson's avatar
Alex Williamson committed
372

373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
	return group;
}

/*
 * Create, name and publish a group for @iommu_group.
 *
 * If another caller published a group for the same iommu_group first, the
 * freshly allocated group is dropped and the existing one is returned with
 * a reference taken.  Returns an ERR_PTR on failure.
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
		enum vfio_group_type type)
{
	struct vfio_group *group;
	struct vfio_group *ret;
	int err;

	group = vfio_group_alloc(iommu_group, type);
	if (IS_ERR(group))
		return group;

	err = dev_set_name(&group->dev, "%s%d",
			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
			   iommu_group_id(iommu_group));
	if (err) {
		ret = ERR_PTR(err);
		goto err_put;
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	ret = __vfio_group_get_from_iommu(iommu_group);
	if (ret)
		goto err_unlock;

	err = cdev_device_add(&group->cdev, &group->dev);
	if (err) {
		ret = ERR_PTR(err);
		goto err_unlock;
	}

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return group;

err_unlock:
	mutex_unlock(&vfio.group_lock);
err_put:
	/* Frees the unpublished group through vfio_group_release(). */
	put_device(&group->dev);
	return ret;
}

420
static void vfio_group_put(struct vfio_group *group)
Alex Williamson's avatar
Alex Williamson committed
421
{
422 423
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;
Alex Williamson's avatar
Alex Williamson committed
424

425 426 427 428 429 430
	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group. Since all
	 * pairs must be undone these WARN_ON's indicate some caller did not
	 * properly hold the group reference.
	 */
Alex Williamson's avatar
Alex Williamson committed
431
	WARN_ON(!list_empty(&group->device_list));
432
	WARN_ON(group->container || group->container_users);
433
	WARN_ON(group->notifier.head);
Alex Williamson's avatar
Alex Williamson committed
434 435

	list_del(&group->vfio_next);
436 437 438 439
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
Alex Williamson's avatar
Alex Williamson committed
440 441 442 443
}

static void vfio_group_get(struct vfio_group *group)
{
444
	refcount_inc(&group->users);
Alex Williamson's avatar
Alex Williamson committed
445 446
}

447
/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
451
static void vfio_device_put(struct vfio_device *device)
Alex Williamson's avatar
Alex Williamson committed
452
{
453 454
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
Alex Williamson's avatar
Alex Williamson committed
455 456
}

457
static bool vfio_device_try_get(struct vfio_device *device)
Alex Williamson's avatar
Alex Williamson committed
458
{
459
	return refcount_inc_not_zero(&device->refcount);
Alex Williamson's avatar
Alex Williamson committed
460 461 462 463 464 465 466 467 468
}

/*
 * Find the vfio_device on @group whose struct device is @dev.  Returns it
 * with a reference taken, or NULL if there is no such device or its
 * refcount has already reached zero.
 */
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev && vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

478
/*
 * VFIO driver API
 */
481
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
482
			 const struct vfio_device_ops *ops)
Alex Williamson's avatar
Alex Williamson committed
483
{
484 485 486 487 488 489
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);

490 491
/* Drop @device's membership in its device set, if it has one. */
void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);

496 497
/*
 * Allocate a fresh iommu_group named "vfio-noiommu", add @dev to it and
 * wrap it in a vfio_group of the given @type.
 *
 * Returns the new group or an ERR_PTR.  On failure @dev is removed from the
 * iommu_group again (if it had been added) and the iommu_group reference
 * obtained here is dropped.
 */
static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
		enum vfio_group_type type)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	int ret;

	iommu_group = iommu_group_alloc();
	if (IS_ERR(iommu_group))
		return ERR_CAST(iommu_group);

	/* Naming can fail; do not continue with a half-initialised group. */
	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
	if (ret)
		goto out_put_group;
	ret = iommu_group_add_device(iommu_group, dev);
	if (ret)
		goto out_put_group;

	group = vfio_create_group(iommu_group, type);
	if (IS_ERR(group)) {
		ret = PTR_ERR(group);
		goto out_remove_device;
	}
	/* The vfio_group took its own reference in vfio_group_alloc(). */
	iommu_group_put(iommu_group);
	return group;

out_remove_device:
	iommu_group_remove_device(dev);
out_put_group:
	iommu_group_put(iommu_group);
	return ERR_PTR(ret);
}

static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
#ifdef CONFIG_VFIO_NOIOMMU
534
	if (!iommu_group && noiommu) {
535 536
		/*
		 * With noiommu enabled, create an IOMMU group for devices that
537 538
		 * don't already have one, implying no IOMMU hardware/driver
		 * exists.  Taint the kernel because we're about to give a DMA
539 540
		 * capable device to a user without IOMMU protection.
		 */
541
		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
542 543 544 545 546 547 548
		if (!IS_ERR(group)) {
			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
		}
		return group;
	}
#endif
549 550 551 552
	if (!iommu_group)
		return ERR_PTR(-EINVAL);

	group = vfio_group_get_from_iommu(iommu_group);
553 554
	if (!group)
		group = vfio_create_group(iommu_group, VFIO_IOMMU);
555

556
	/* The vfio_group holds a reference to the iommu_group */
557 558 559 560
	iommu_group_put(iommu_group);
	return group;
}

561 562
static int __vfio_register_dev(struct vfio_device *device,
		struct vfio_group *group)
563 564
{
	struct vfio_device *existing_device;
565 566 567

	if (IS_ERR(group))
		return PTR_ERR(group);
Alex Williamson's avatar
Alex Williamson committed
568

569 570 571 572 573 574 575
	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

576 577 578
	existing_device = vfio_group_get_device(group, device->dev);
	if (existing_device) {
		dev_WARN(device->dev, "Device already exists on group %d\n",
579
			 iommu_group_id(group->iommu_group));
580
		vfio_device_put(existing_device);
581 582
		if (group->type == VFIO_NO_IOMMU ||
		    group->type == VFIO_EMULATED_IOMMU)
583
			iommu_group_remove_device(device->dev);
Alex Williamson's avatar
Alex Williamson committed
584 585 586 587
		vfio_group_put(group);
		return -EBUSY;
	}

588 589 590 591 592 593 594 595 596 597 598 599 600
	/* Our reference on group is moved to the device */
	device->group = group;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return 0;
}
601 602 603

int vfio_register_group_dev(struct vfio_device *device)
{
604 605 606 607 608 609 610
	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency.
	 */
	if (!iommu_capable(device->dev->bus, IOMMU_CAP_CACHE_COHERENCY))
		return -EINVAL;

611 612 613
	return __vfio_register_dev(device,
		vfio_group_find_or_alloc(device->dev));
}
614 615
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

616 617 618 619 620 621 622 623 624 625 626
/*
 * Register a virtual device that has no IOMMU backing.  A dedicated
 * VFIO_EMULATED_IOMMU group is created for it.  Users of such a device
 * must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	struct vfio_group *group;

	/* May be an ERR_PTR; __vfio_register_dev() converts it to an errno. */
	group = vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU);

	return __vfio_register_dev(device, group);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

Alex Williamson's avatar
Alex Williamson committed
627 628 629
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
630
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
Alex Williamson's avatar
Alex Williamson committed
631 632

	mutex_lock(&group->device_lock);
633
	list_for_each_entry(it, &group->device_list, group_next) {
634 635 636
		int ret;

		if (it->ops->match) {
637
			ret = it->ops->match(it, buf);
638 639 640 641 642 643 644 645
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

646
		if (ret && vfio_device_try_get(it)) {
647
			device = it;
Alex Williamson's avatar
Alex Williamson committed
648 649 650 651 652 653 654 655
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

Alex Williamson's avatar
Alex Williamson committed
656 657 658
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
659
void vfio_unregister_group_dev(struct vfio_device *device)
Alex Williamson's avatar
Alex Williamson committed
660 661
{
	struct vfio_group *group = device->group;
662
	unsigned int i = 0;
663
	bool interrupted = false;
664
	long rc;
Alex Williamson's avatar
Alex Williamson committed
665 666

	vfio_device_put(device);
667 668
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
669
		if (device->ops->request)
670
			device->ops->request(device, i++);
671

672
		if (interrupted) {
673 674
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
675
		} else {
676 677 678
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
679
				interrupted = true;
680
				dev_warn(device->dev,
681 682 683 684 685 686
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
687
	}
688

689 690 691 692
	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);
693

694
	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
695
		iommu_group_remove_device(device->dev);
696

697
	/* Matches the get in vfio_register_group_dev() */
698
	vfio_group_put(group);
699 700 701
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

702
/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
708
	struct vfio_iommu_driver *driver;
Alex Williamson's avatar
Alex Williamson committed
709 710
	long ret = 0;

711 712 713 714
	down_read(&container->group_lock);

	driver = container->iommu_driver;

Alex Williamson's avatar
Alex Williamson committed
715 716 717 718 719 720 721 722 723 724 725
	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
726 727
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {
728 729

				if (!list_empty(&container->group_list) &&
730 731
				    !vfio_iommu_driver_allowed(container,
							       driver))
732
					continue;
Alex Williamson's avatar
Alex Williamson committed
733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

749 750
	up_read(&container->group_lock);

Alex Williamson's avatar
Alex Williamson committed
751 752 753
	return ret;
}

754
/* hold write lock on container->group_lock */
Alex Williamson's avatar
Alex Williamson committed
755 756 757 758 759 760 761 762
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
763 764
		ret = driver->ops->attach_group(data, group->iommu_group,
						group->type);
Alex Williamson's avatar
Alex Williamson committed
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

786
	down_write(&container->group_lock);
Alex Williamson's avatar
Alex Williamson committed
787 788 789 790 791 792 793 794 795 796

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
797
		up_write(&container->group_lock);
Alex Williamson's avatar
Alex Williamson committed
798 799 800 801
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
802
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
Alex Williamson's avatar
Alex Williamson committed
803 804
		void *data;

805
		if (!vfio_iommu_driver_allowed(container, driver))
806
			continue;
Alex Williamson's avatar
Alex Williamson committed
807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
826
			continue;
Alex Williamson's avatar
Alex Williamson committed
827 828 829
		}

		ret = __vfio_container_attach_groups(container, driver, data);
830
		if (ret) {
Alex Williamson's avatar
Alex Williamson committed
831 832
			driver->ops->release(data);
			module_put(driver->ops->owner);
833
			continue;
Alex Williamson's avatar
Alex Williamson committed
834 835
		}

836 837 838
		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
Alex Williamson's avatar
Alex Williamson committed
839 840 841
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
842
	up_write(&container->group_lock);
Alex Williamson's avatar
Alex Williamson committed
843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
869 870 871
		driver = container->iommu_driver;
		data = container->iommu_data;

Alex Williamson's avatar
Alex Williamson committed
872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887
		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

/*
 * open() of the container fd: allocates an empty container holding one
 * reference and stores it in filep->private_data.  Returns 0 or -ENOMEM.
 */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
899 900 901 902 903
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);
Alex Williamson's avatar
Alex Williamson committed
904 905 906 907 908 909 910 911 912 913 914 915 916

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
917
	.compat_ioctl	= compat_ptr_ioctl,
Alex Williamson's avatar
Alex Williamson committed
918 919
};

920
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

928 929
	lockdep_assert_held_write(&group->group_rwsem);

930
	down_write(&container->group_lock);
Alex Williamson's avatar
Alex Williamson committed
931 932 933 934 935 936

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

937 938
	if (group->type == VFIO_IOMMU)
		iommu_group_release_dma_owner(group->iommu_group);
939

Alex Williamson's avatar
Alex Williamson committed
940
	group->container = NULL;
941
	group->container_users = 0;
Alex Williamson's avatar
Alex Williamson committed
942 943 944 945 946 947 948 949 950 951
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

952
	up_write(&container->group_lock);
Alex Williamson's avatar
Alex Williamson committed
953 954 955 956 957 958 959 960 961 962 963 964

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
965
	lockdep_assert_held_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
966

967
	if (!group->container)
Alex Williamson's avatar
Alex Williamson committed
968
		return -EINVAL;
969
	if (group->container_users != 1)
Alex Williamson's avatar
Alex Williamson committed
970 971 972 973 974 975 976
		return -EBUSY;
	__vfio_group_unset_container(group);
	return 0;
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
977
	struct fd f;
Alex Williamson's avatar
Alex Williamson committed
978 979
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
980
	int ret = 0;
Alex Williamson's avatar
Alex Williamson committed
981

982 983
	lockdep_assert_held_write(&group->group_rwsem);

984
	if (group->container || WARN_ON(group->container_users))
Alex Williamson's avatar
Alex Williamson committed
985 986
		return -EINVAL;

987
	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
988 989
		return -EPERM;

990 991
	f = fdget(container_fd);
	if (!f.file)
Alex Williamson's avatar
Alex Williamson committed
992 993 994
		return -EBADF;

	/* Sanity check, is this really our fd? */
995 996
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
Alex Williamson's avatar
Alex Williamson committed
997 998 999
		return -EINVAL;
	}

1000
	container = f.file->private_data;
Alex Williamson's avatar
Alex Williamson committed
1001 1002
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

1003
	down_write(&container->group_lock);
Alex Williamson's avatar
Alex Williamson committed
1004

1005 1006
	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
1007
	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1008 1009 1010 1011
		ret = -EPERM;
		goto unlock_out;
	}

1012 1013 1014 1015 1016
	if (group->type == VFIO_IOMMU) {
		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
		if (ret)
			goto unlock_out;
	}
1017

Alex Williamson's avatar
Alex Williamson committed
1018 1019 1020
	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
1021 1022
						group->iommu_group,
						group->type);
1023
		if (ret) {
1024 1025 1026
			if (group->type == VFIO_IOMMU)
				iommu_group_release_dma_owner(
					group->iommu_group);
Alex Williamson's avatar
Alex Williamson committed
1027
			goto unlock_out;
1028
		}
Alex Williamson's avatar
Alex Williamson committed
1029 1030 1031
	}

	group->container = container;
1032
	group->container_users = 1;
1033
	container->noiommu = (group->type == VFIO_NO_IOMMU);
Alex Williamson's avatar
Alex Williamson committed
1034 1035 1036 1037 1038 1039
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);

unlock_out:
1040
	up_write(&container->group_lock);
1041
	fdput(f);
Alex Williamson's avatar
Alex Williamson committed
1042 1043 1044 1045 1046
	return ret;
}

static const struct file_operations vfio_device_fops;

1047 1048
/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
1049
{
1050 1051 1052
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

1053
static int vfio_device_assign_container(struct vfio_device *device)
Alex Williamson's avatar
Alex Williamson committed
1054
{
1055
	struct vfio_group *group = device->group;
Alex Williamson's avatar
Alex Williamson committed
1056

1057 1058
	lockdep_assert_held_write(&group->group_rwsem);

1059 1060
	if (!group->container || !group->container->iommu_driver ||
	    WARN_ON(!group->container_users))
1061 1062
		return -EINVAL;

1063
	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1064 1065
		return -EPERM;

1066
	get_file(group->opened_file);
1067
	group->container_users++;
1068 1069 1070
	return 0;
}

1071 1072 1073
static void vfio_device_unassign_container(struct vfio_device *device)
{
	down_write(&device->group->group_rwsem);
1074 1075
	WARN_ON(device->group->container_users <= 1);
	device->group->container_users--;
1076 1077 1078
	fput(device->group->opened_file);
	up_write(&device->group->group_rwsem);
}
Alex Williamson's avatar
Alex Williamson committed
1079

1080
static struct file *vfio_device_open(struct vfio_device *device)
Alex Williamson's avatar
Alex Williamson committed
1081 1082
{
	struct file *filep;
1083
	int ret;
1084

1085
	down_write(&device->group->group_rwsem);
1086
	ret = vfio_device_assign_container(device);
1087
	up_write(&device->group->group_rwsem);
1088 1089
	if (ret)
		return ERR_PTR(ret);
Alex Williamson's avatar
Alex Williamson committed
1090

1091
	if (!try_module_get(device->dev->driver->owner)) {
1092
		ret = -ENODEV;
1093
		goto err_unassign_container;
1094 1095
	}

1096 1097
	mutex_lock(&device->dev_set->lock);
	device->open_count++;
1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
	if (device->open_count == 1) {
		/*
		 * Here we pass the KVM pointer with the group under the read
		 * lock.  If the device driver will use it, it must obtain a
		 * reference and release it during close_device.
		 */
		down_read(&device->group->group_rwsem);
		device->kvm = device->group->kvm;

		if (device->ops->open_device) {
			ret = device->ops->open_device(device);
			if (ret)
				goto err_undo_count;
		}
		up_read(&device->group->group_rwsem);
1113 1114 1115
	}
	mutex_unlock(&device->dev_set->lock);

Alex Williamson's avatar
Alex Williamson committed
1116 1117 1118 1119 1120 1121 1122 1123
	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
1124
		goto err_close_device;
Alex Williamson's avatar
Alex Williamson committed
1125 1126 1127 1128 1129 1130 1131 1132
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
Alex Williamson's avatar
Alex Williamson committed
1133

1134
	if (device->group->type == VFIO_NO_IOMMU)
1135 1136
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1137 1138 1139 1140 1141
	/*
	 * On success the ref of device is moved to the file and
	 * put in vfio_device_fops_release()
	 */
	return filep;
1142

1143 1144
err_close_device:
	mutex_lock(&device->dev_set->lock);
1145
	down_read(&device->group->group_rwsem);
1146 1147 1148 1149
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);
err_undo_count:
	device->open_count--;
1150 1151 1152
	if (device->open_count == 0 && device->kvm)
		device->kvm = NULL;
	up_read(&device->group->group_rwsem);
1153 1154
	mutex_unlock(&device->dev_set->lock);
	module_put(device->dev->driver->owner);
1155
err_unassign_container:
1156
	vfio_device_unassign_container(device);
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188
	return ERR_PTR(ret);
}

/*
 * Look up the device named @buf in @group, open it and install the
 * resulting file in a newly allocated O_CLOEXEC file descriptor.
 *
 * Return: the new fd number on success, negative errno on failure.  On
 * failure the reserved fd and the device reference from the lookup are
 * both released.
 */
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int fdno;
	int ret;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	fdno = get_unused_fd_flags(O_CLOEXEC);
	if (fdno < 0) {
		ret = fdno;
		goto err_put_device;
	}

	filep = vfio_device_open(device);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_put_fdno;
	}

	fd_install(fdno, filep);
	return fdno;

err_put_fdno:
	put_unused_fd(fdno);
err_put_device:
	vfio_device_put(device);
	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

1215
		down_read(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1216
		if (group->container)
1217 1218 1219 1220
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
					VFIO_GROUP_FLAGS_VIABLE;
		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1221
		up_read(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

1239
		down_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1240
		ret = vfio_group_set_container(group, fd);
1241
		up_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1242 1243 1244
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
1245
		down_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1246
		ret = vfio_group_unset_container(group);
1247
		up_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
1268 1269
	struct vfio_group *group =
		container_of(inode->i_cdev, struct vfio_group, cdev);
1270
	int ret;
Alex Williamson's avatar
Alex Williamson committed
1271

1272
	down_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1273

1274 1275 1276 1277
	/* users can be zero if this races with vfio_group_put() */
	if (!refcount_inc_not_zero(&group->users)) {
		ret = -ENODEV;
		goto err_unlock;
1278 1279
	}

1280 1281 1282
	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
		ret = -EPERM;
		goto err_put;
Alex Williamson's avatar
Alex Williamson committed
1283 1284
	}

1285 1286 1287
	/*
	 * Do we need multiple instances of the group open?  Seems not.
	 */
1288
	if (group->opened_file) {
1289 1290
		ret = -EBUSY;
		goto err_put;
Alex Williamson's avatar
Alex Williamson committed
1291
	}
1292
	group->opened_file = filep;
Alex Williamson's avatar
Alex Williamson committed
1293 1294
	filep->private_data = group;

1295
	up_write(&group->group_rwsem);
Alex Williamson's avatar
Alex Williamson committed
1296
	return 0;
1297 1298 1299 1300 1301
err_put:
	vfio_group_put(group);
err_unlock:
	up_write(&group->group_rwsem);
	return ret;
Alex Williamson's avatar
Alex Williamson committed
1302 1303 1304 1305 1306 1307 1308 1309
}

/*
 * release() handler for the group file.  Detaches the group from its
 * container if one is still set, clears opened_file and drops the group
 * reference taken in vfio_group_fops_open().  Always returns 0.
 */
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	down_write(&group->group_rwsem);
	/*
	 * Device FDs hold a group file reference, therefore the group release
	 * is only called when there are no open devices.
	 */
	WARN_ON(group->notifier.head);
	if (group->container) {
		WARN_ON(group->container_users != 1);
		__vfio_group_unset_container(group);
	}
	group->opened_file = NULL;
	up_write(&group->group_rwsem);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1331
	.compat_ioctl	= compat_ptr_ioctl,
Alex Williamson's avatar
Alex Williamson committed
1332 1333 1334 1335
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

1336
/*
Alex Williamson's avatar
Alex Williamson committed
1337 1338 1339 1340 1341 1342
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

1343
	mutex_lock(&device->dev_set->lock);
1344
	vfio_assert_device_open(device);
1345
	down_read(&device->group->group_rwsem);
1346
	if (device->open_count == 1 && device->ops->close_device)
1347
		device->ops->close_device(device);
1348
	up_read(&device->group->group_rwsem);
1349
	device->open_count--;
1350 1351
	if (device->open_count == 0)
		device->kvm = NULL;
1352
	mutex_unlock(&device->dev_set->lock);
Alex Williamson's avatar
Alex Williamson committed
1353

1354 1355
	module_put(device->dev->driver->owner);

1356
	vfio_device_unassign_container(device);
Alex Williamson's avatar
Alex Williamson committed
1357 1358 1359 1360 1361 1362

	vfio_device_put(device);

	return 0;
}

1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382
/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
1383
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1384
	/*
1385 1386
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
1387 1388 1389 1390 1391
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 * Without P2P the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1407
	 *         RESUMING -> STOP -> STOP_COPY
1408 1409 1410 1411 1412 1413
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> RUNNING
1414
	 *         STOP_COPY -> STOP -> RESUMING
1415 1416
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1417 1418 1419 1420
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1421
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1422 1423
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1424
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1425 1426 1427
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
1428
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1429
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1430 1431 1432
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1433 1434 1435 1436 1437 1438 1439
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1440
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1441 1442 1443 1444 1445 1446 1447
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1448 1449 1450 1451 1452 1453 1454 1455 1456
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1457 1458 1459 1460 1461 1462 1463
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1464
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1465 1466 1467 1468
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
1482 1483
		return -EINVAL;

1484 1485 1486
	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	   (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
1487 1488
		return -EINVAL;

1489 1490 1491 1492 1493
	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
1494
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1495 1496 1497 1498
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587
	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);

/*
 * Convert the driver's struct file into a FD number and return it to
 * userspace through the data_fd member of @mig.
 *
 * The reference on @filp is consumed in every case: it is installed into
 * the new fd on success and dropped with fput() on failure.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int fdno = get_unused_fd_flags(O_CLOEXEC);

	if (fdno < 0) {
		fput(filp);
		return fdno;
	}

	/* Publish the fd number before installing, so a fault can be undone */
	mig->data_fd = fdno;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		put_unused_fd(fdno);
		fput(filp);
		return -EFAULT;
	}

	fd_install(fdno, filp);
	return 0;
}

/*
 * Handle VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: GET reports the current
 * migration state, SET requests a new one.  When the driver's
 * migration_set_state() returns a file, it is handed to userspace as
 * data_fd; otherwise data_fd is reported as -1.
 *
 * Returns -ENOTTY when the driver lacks either migration callback.
 */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	struct vfio_device_feature_mig_state mig;
	struct file *data_file = NULL;
	size_t minsz;
	int ret;

	if (!device->ops->migration_set_state ||
	    !device->ops->migration_get_state)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	minsz = offsetofend(struct vfio_device_feature_mig_state, data_fd);
	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->ops->migration_get_state(device, &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
	} else {
		/* Handle the VFIO_DEVICE_FEATURE_SET */
		data_file = device->ops->migration_set_state(device,
							     mig.device_state);
		if (data_file && !IS_ERR(data_file))
			return vfio_ioct_mig_return_fd(data_file, arg, &mig);
	}

	/* No data file to hand out: report -1, then any driver error */
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(data_file))
		return PTR_ERR(data_file);
	return 0;
}

/*
 * Handle VFIO_DEVICE_FEATURE_MIGRATION (GET only): copy the device's
 * migration_flags to userspace.  Returns -ENOTTY when the driver lacks
 * either migration callback.
 */
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->ops->migration_set_state ||
	    !device->ops->migration_get_state)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629
/*
 * Handle the VFIO_DEVICE_FEATURE ioctl: validate the common header, then
 * dispatch on the feature index.  The migration features are handled
 * here; everything else goes to the driver's device_feature() callback,
 * or fails with -EINVAL when the driver has none.
 */
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	/* Handlers receive the payload and its size, header excluded */
	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

Alex Williamson's avatar
Alex Williamson committed
1647 1648 1649 1650 1651
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

1652 1653 1654 1655 1656 1657 1658 1659
	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		return vfio_ioctl_device_feature(device, (void __user *)arg);
	default:
		if (unlikely(!device->ops->ioctl))
			return -EINVAL;
		return device->ops->ioctl(device, cmd, arg);
	}
Alex Williamson's avatar
Alex Williamson committed
1660 1661 1662 1663 1664 1665 1666 1667 1668 1669
}

/*
 * read() handler for a device file: forwards to the driver's read()
 * callback, or returns -EINVAL when the driver does not provide one.
 */
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

/*
 * write() handler for a device file: forwards to the driver's write()
 * callback, or returns -EINVAL when the driver does not provide one.
 */
static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

/*
 * mmap() handler for a device file: forwards to the driver's mmap()
 * callback, or returns -EINVAL when the driver does not provide one.
 */
static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1701
	.compat_ioctl	= compat_ptr_ioctl,
Alex Williamson's avatar
Alex Williamson committed
1702 1703 1704
	.mmap		= vfio_device_fops_mmap,
};

1705 1706 1707
/**
 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
 * @file: VFIO group file
1708
 *
1709
 * The returned iommu_group is valid as long as a ref is held on the file.
1710
 */
1711
struct iommu_group *vfio_file_iommu_group(struct file *file)
1712
{
1713
	struct vfio_group *group = file->private_data;
1714

1715 1716 1717
	if (file->f_op != &vfio_group_fops)
		return NULL;
	return group->iommu_group;
1718
}
1719
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1720

1721 1722 1723 1724
/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file
1725
 *
1726 1727 1728
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
1729
 */
1730
bool vfio_file_enforced_coherent(struct file *file)
1731
{
1732 1733
	struct vfio_group *group = file->private_data;
	bool ret;
1734

1735 1736
	if (file->f_op != &vfio_group_fops)
		return true;
1737

1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748
	down_read(&group->group_rwsem);
	if (group->container) {
		ret = vfio_ioctl_check_extension(group->container,
						 VFIO_DMA_CC_IOMMU);
	} else {
		/*
		 * Since the coherency state is determined only once a container
		 * is attached the user must do so before they can prove they
		 * have permission.
		 */
		ret = true;
1749
	}
1750
	up_read(&group->group_rwsem);
1751
	return ret;
1752
}
1753
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1754

1755 1756 1757 1758 1759
/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file
 * @kvm: KVM to link
 *
1760 1761
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the group.
1762 1763
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1764
{
1765
	struct vfio_group *group = file->private_data;
1766

1767 1768
	if (file->f_op != &vfio_group_fops)
		return;
1769

1770
	down_write(&group->group_rwsem);
1771
	group->kvm = kvm;
1772
	up_write(&group->group_rwsem);
1773
}
1774
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1775

1776 1777 1778 1779 1780 1781 1782 1783
/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1784
{
1785
	struct vfio_group *group = file->private_data;
1786

1787 1788 1789 1790
	if (file->f_op != &vfio_group_fops)
		return false;

	return group == device->group;
1791
}
1792
EXPORT_SYMBOL_GPL(vfio_file_has_dev);
1793

1794
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 *
 * On allocation failure the existing buffer is freed, @caps is reset to
 * empty and ERR_PTR(-ENOMEM) is returned.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

/*
 * Add @offset to every non-zero next link in the capability chain held in
 * @caps.  The links stored in the buffer are relative to the buffer
 * start, so the walk subtracts @offset again to find the next entry.
 */
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
1849

1850 1851
/*
 * Append a copy of the capability @cap, which is @size bytes long
 * including its header, to the chain in @caps.
 *
 * Return: 0 on success, -EINVAL if @size is smaller than the capability
 * header, or the error from vfio_info_cap_add().
 */
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	/*
	 * A capability is at least a header.  A smaller size would make the
	 * payload length below wrap around and overrun the new buffer.
	 */
	if (size < sizeof(*header))
		return -EINVAL;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	/* Copy only the payload; id/version were set by vfio_info_cap_add() */
	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
1864

1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912
/*
 * Validate a struct vfio_irq_set header against the device limits and work
 * out how many bytes of per-IRQ data follow it.
 *
 * @hdr          : header to validate
 * @num_irqs     : number of IRQs available at hdr->index
 * @max_irq_type : first invalid value for hdr->index
 * @data_size    : if non-NULL, receives the payload size in bytes; it is
 *		   zeroed once the header fields pass the first round of
 *		   checks.  Must be non-NULL when the data type carries a
 *		   payload.
 * Return 0 on success, -EINVAL on any validation failure.
 */
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	size_t elem_size;
	size_t payload;

	/* Header must be complete and name a known index */
	if (hdr->argsz < minsz)
		return -EINVAL;
	if (hdr->index >= max_irq_type)
		return -EINVAL;

	/* start + count must not wrap */
	if (hdr->count >= (U32_MAX - hdr->start))
		return -EINVAL;

	/* No flag bits outside the data-type and action-type masks */
	if (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			   VFIO_IRQ_SET_ACTION_TYPE_MASK))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	/* The requested range must lie within the available IRQs */
	if (hdr->start >= num_irqs)
		return -EINVAL;
	if (hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	/* Per-IRQ element size depends on the data type */
	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		/* No payload: nothing further to validate */
		return 0;
	case VFIO_IRQ_SET_DATA_BOOL:
		elem_size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		elem_size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	/* The caller's buffer must be large enough to hold the payload */
	payload = hdr->count * elem_size;
	if (hdr->argsz - minsz < payload)
		return -EINVAL;

	if (!data_size)
		return -EINVAL;

	*data_size = payload;
	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
1916
 * @device [in]  : device
1917
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1918 1919 1920 1921 1922 1923
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
1924 1925
int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
		   int npage, int prot, unsigned long *phys_pfn)
1926 1927
{
	struct vfio_container *container;
1928
	struct vfio_group *group = device->group;
1929 1930 1931
	struct vfio_iommu_driver *driver;
	int ret;

1932 1933
	if (!user_pfn || !phys_pfn || !npage ||
	    !vfio_assert_device_open(device))
1934 1935 1936 1937 1938
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

1939 1940
	if (group->dev_counter > 1)
		return -EINVAL;
1941

1942
	/* group->container cannot change while a vfio device is open */
1943 1944 1945
	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
1946 1947
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
1948 1949 1950 1951 1952 1953 1954 1955 1956 1957
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin set of host PFNs for local domain only.
1958
 * @device [in]  : device
1959 1960 1961 1962 1963 1964
 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
1965 1966
int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
		     int npage)
1967 1968 1969 1970 1971
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

1972
	if (!user_pfn || !npage || !vfio_assert_device_open(device))
1973 1974 1975 1976 1977
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

1978
	/* group->container cannot change while a vfio device is open */
1979
	container = device->group->container;
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
2001
 * @device [in]		: VFIO device
2002 2003 2004 2005 2006 2007
 * @user_iova [in]	: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
2008 2009
int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
		size_t len, bool write)
2010 2011 2012 2013 2014
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

2015
	if (!data || len <= 0 || !vfio_assert_device_open(device))
2016 2017
		return -EINVAL;

2018
	/* group->container cannot change while a vfio device is open */
2019
	container = device->group->container;
2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;
	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);

static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
2034 2035 2036 2037 2038
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

2039
	lockdep_assert_held_read(&group->group_rwsem);
2040 2041 2042 2043

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
2044 2045
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
2046 2047 2048 2049 2050 2051
	else
		ret = -ENOTTY;

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
2054 2055 2056 2057 2058
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

2059
	lockdep_assert_held_read(&group->group_rwsem);
2060 2061 2062 2063 2064 2065 2066 2067 2068

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

2069 2070 2071
	return ret;
}

int vfio_register_notifier(struct vfio_device *device,
			   enum vfio_notify_type type, unsigned long *events,
			   struct notifier_block *nb)
2075
{
2076
	struct vfio_group *group = device->group;
2077 2078
	int ret;

2079 2080
	if (!nb || !events || (*events == 0) ||
	    !vfio_assert_device_open(device))
2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093
		return -EINVAL;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct vfio_device *device,
			     enum vfio_notify_type type,
2096 2097
			     struct notifier_block *nb)
{
2098
	struct vfio_group *group = device->group;
2099 2100
	int ret;

2101
	if (!nb || !vfio_assert_device_open(device))
2102 2103 2104 2105 2106 2107 2108 2109 2110
		return -EINVAL;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}
2111 2112 2113 2114
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);

/*
 * Module/class support
 */
/*
 * devnode callback for the vfio class: build the node name "vfio/<name>"
 * from the device name.  @mode is not modified.  The string comes from
 * kasprintf(GFP_KERNEL), so the result is NULL if that allocation fails.
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	const char *name = dev_name(dev);

	return kasprintf(GFP_KERNEL, "vfio/%s", name);
}

/* Misc device registered by vfio_init(); node name is "vfio/vfio" */
static struct miscdevice vfio_dev = {
	.minor		= VFIO_MINOR,
	.name		= "vfio",
	.nodename	= "vfio/vfio",
	.fops		= &vfio_fops,
	.mode		= 0666,	/* read/write for user, group and other */
};

/*
 * Module init: set up the global vfio state, register the misc device,
 * create the "vfio" class, reserve the group char device region and, with
 * CONFIG_VFIO_NOIOMMU, register the no-IOMMU driver.
 *
 * Returns 0 on success or a negative errno.  On failure every step that
 * already succeeded is undone in reverse order.
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	/*
	 * ret is 0 here, so the check below only fires when the no-IOMMU
	 * driver is configured in and fails to register.
	 */
#ifdef CONFIG_VFIO_NOIOMMU
	ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	if (ret)
		goto err_driver_register;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	return 0;

err_driver_register:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

/*
 * Module exit: warn if any group is still on the global list, then release
 * what vfio_init() set up, plus the device-set xarray.
 */
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}

/* Module entry and exit points */
module_init(vfio_init);
module_exit(vfio_cleanup);

/* Module metadata and aliases */
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");