Commit d624d665 authored by Jason Gunthorpe

iommufd: vfio container FD ioctl compatibility

iommufd can directly implement the /dev/vfio/vfio container IOCTLs by
mapping them into io_pagetable operations.

A userspace application can test against iommufd and confirm compatibility
then simply make a small change to open /dev/iommu instead of
/dev/vfio/vfio.

For testing purposes /dev/vfio/vfio can be symlinked to /dev/iommu and
then all applications will use the compatibility path with no code
changes. A later series allows /dev/vfio/vfio to be directly provided by
iommufd, which allows the rlimit mode to work the same as well.

This series just provides the iommufd side of compatibility. Actually
linking this to VFIO_SET_CONTAINER is a followup series, with a link in
the cover letter.

Internally the compatibility API uses a normal IOAS object that, like
vfio, is automatically allocated when the first device is
attached.

Userspace can also query or set this IOAS object directly using the
IOMMU_VFIO_IOAS ioctl. This allows mixing and matching new iommufd only
features while still using the VFIO style map/unmap ioctls.

While this is enough to operate qemu, it has a few differences:

 - Resource limits rely on memory cgroups to bound what userspace can do
   instead of the module parameter dma_entry_limit.

 - VFIO P2P is not implemented. The DMABUF patches for vfio are a start at
   a solution where iommufd would import a special DMABUF. This is to avoid
   further propagating the follow_pfn() security problem.

 - A full audit for pedantic compatibility details (eg errnos, etc) has
   not yet been done

 - powerpc SPAPR is left out, as it is not connected to the iommu_domain
   framework. It seems interest in SPAPR is minimal as it is currently
   non-working in v6.1-rc1. They will have to convert to the iommu
   subsystem framework to enjoy iommufd.

The following are not going to be implemented and we expect to remove them
from VFIO type1:

 - SW access 'dirty tracking'. As discussed in the cover letter this will
   be done in VFIO.

 - VFIO_TYPE1_NESTING_IOMMU
    https://lore.kernel.org/all/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/

 - VFIO_DMA_MAP_FLAG_VADDR
    https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/

Link: https://lore.kernel.org/r/15-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Yi Liu <yi.l.liu@intel.com>
Tested-by: Lixiao Yang <lixiao.yang@intel.com>
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
parent 8d40205f
......@@ -5,6 +5,7 @@ iommufd-y := \
io_pagetable.o \
ioas.o \
main.o \
pages.o
pages.o \
vfio_compat.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
......@@ -18,6 +18,7 @@ struct iommufd_ctx {
struct xarray objects;
u8 account_mode;
struct iommufd_ioas *vfio_ioas;
};
/*
......@@ -92,6 +93,9 @@ struct iommufd_ucmd {
void *cmd;
};
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
unsigned long arg);
/* Copy the response in ucmd->cmd back to userspace. */
static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
size_t cmd_len)
......@@ -222,6 +226,8 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
* allows directly creating and inspecting the domains. Domains that have kernel
......
......@@ -133,6 +133,8 @@ bool iommufd_object_destroy_user(struct iommufd_ctx *ictx,
return false;
}
__xa_erase(&ictx->objects, obj->id);
if (ictx->vfio_ioas && &ictx->vfio_ioas->obj == obj)
ictx->vfio_ioas = NULL;
xa_unlock(&ictx->objects);
up_write(&obj->destroy_rwsem);
......@@ -271,27 +273,31 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
length),
IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
val64),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
};
static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct iommufd_ctx *ictx = filp->private_data;
const struct iommufd_ioctl_op *op;
struct iommufd_ucmd ucmd = {};
union ucmd_buffer buf;
unsigned int nr;
int ret;
ucmd.ictx = filp->private_data;
nr = _IOC_NR(cmd);
if (nr < IOMMUFD_CMD_BASE ||
(nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
return iommufd_vfio_ioctl(ictx, cmd, arg);
ucmd.ictx = ictx;
ucmd.ubuffer = (void __user *)arg;
ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
if (ret)
return ret;
nr = _IOC_NR(cmd);
if (nr < IOMMUFD_CMD_BASE ||
(nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
return -ENOIOCTLCMD;
op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
if (op->ioctl_num != cmd)
return -ENOIOCTLCMD;
......
This diff is collapsed.
......@@ -54,6 +54,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access,
unsigned long iova, unsigned long length);
int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
void *data, size_t len, unsigned int flags);
int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
#else /* !CONFIG_IOMMUFD */
static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
......@@ -84,5 +85,11 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long
{
return -EOPNOTSUPP;
}
/*
 * Stub for builds without CONFIG_IOMMUFD: there is no compatibility IOAS to
 * report, so neither argument is touched and the call always fails with
 * -EOPNOTSUPP.
 */
static inline int
iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
	return -EOPNOTSUPP;
}
#endif /* CONFIG_IOMMUFD */
#endif
......@@ -44,6 +44,7 @@ enum {
IOMMUFD_CMD_IOAS_MAP,
IOMMUFD_CMD_IOAS_UNMAP,
IOMMUFD_CMD_OPTION,
IOMMUFD_CMD_VFIO_IOAS,
};
/**
......@@ -308,4 +309,39 @@ struct iommu_option {
__aligned_u64 val64;
};
#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
/**
* enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls
* @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS
* @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS
* @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility
*
* One of these values is passed in the @op field of struct iommu_vfio_ioas to
* select what ioctl(IOMMU_VFIO_IOAS) does. The numeric values are spelled out
* explicitly in the definition below.
*/
enum iommufd_vfio_ioas_op {
IOMMU_VFIO_IOAS_GET = 0,
IOMMU_VFIO_IOAS_SET = 1,
IOMMU_VFIO_IOAS_CLEAR = 2,
};
/**
* struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS)
* @size: sizeof(struct iommu_vfio_ioas)
* @ioas_id: For IOMMU_VFIO_IOAS_SET, the input IOAS ID to set.
*           For IOMMU_VFIO_IOAS_GET, receives the output IOAS ID.
* @op: One of enum iommufd_vfio_ioas_op
* @__reserved: Must be 0
*
* The VFIO compatibility support uses a single ioas because VFIO APIs do not
* support the ID field. Set or Get the IOAS that VFIO compatibility will use.
* When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the
* compatibility ioas, either by taking what is already set, or auto creating
* one. From then on VFIO will continue to use that ioas and is not affected by
* this ioctl. SET or CLEAR does not destroy any auto-created IOAS.
*/
struct iommu_vfio_ioas {
__u32 size;
__u32 ioas_id;
__u16 op;
__u16 __reserved;
};
#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment