Commit 463f46e1 authored by Linus Torvalds

Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd

Pull iommufd updates from Jason Gunthorpe:
 "This brings three new iommufd capabilities:

   - Dirty tracking for DMA.

     AMD/ARM/Intel IOMMUs can now record when a DMA writes to a page by
     setting dirty bits in the IOPTEs of the IO page table. This can be
     used to generate a record of what memory is being dirtied by DMA
     activity during VM migration. A VMM like QEMU combines the IOMMU
     dirty bits with the CPU's dirty log to determine what memory to
     transfer (a userspace ioctl sketch appears below).

     VFIO already has a DMA dirty tracking framework that requires PCI
     devices to implement tracking HW internally. The iommufd version
     provides an alternative that the VMM can select, if available. The
     two are designed to have very similar APIs.

   - Userspace controlled attributes for hardware page tables
     (HWPT/iommu_domain). There are currently a few generic attributes
     for HWPTs (support for dirty tracking, and being the parent of a
     nest). This is
     an entry point for the userspace iommu driver to control the HW in
     detail.

   - Nested translation support for HWPTs. This is a 2D translation
     scheme similar to the CPU's, where a DMA goes through a first stage
     to determine an intermediate address, which is then translated
     through a second stage to a physical address.

     As with CPU translation, the first-stage table lives in VM
     controlled memory and the second stage is in the kernel, matching
     the VM's guest-to-physical map.

     As every IOMMU has a unique set of parameters describing the S1 IO
     page table, the userspace IOMMU driver has to marshal the
     information into the correct format.

     This is 1/3 of the feature: it allows creating the nested
     translation and binding it to VFIO devices. The APIs to support
     IOTLB and ATC invalidation of the stage-1 IO page table and
     forwarding of IO faults are still in progress. A rough userspace
     allocation sketch follows below.
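
     A rough userspace sketch of the nested allocation flow (struct and
     field names follow the include/uapi/linux/iommufd.h additions in this
     series; treat the exact layout as an assumption and verify it against
     the header):

         #include <stdint.h>
         #include <sys/ioctl.h>
         #include <linux/iommufd.h>  /* IOMMU_HWPT_ALLOC, iommu_hwpt_vtd_s1, ... */

         /* Hypothetical helper: fd is an open /dev/iommu, dev_id and ioas_id
          * come from earlier iommufd bind/IOAS steps (not shown). */
         static int alloc_nested_hwpt(int fd, __u32 dev_id, __u32 ioas_id,
                                      __u64 guest_s1_pgtbl)
         {
                 struct iommu_hwpt_alloc parent = {
                         .size = sizeof(parent),
                         .flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
                         .dev_id = dev_id,
                         .pt_id = ioas_id,        /* stage 2 comes from the IOAS */
                 };
                 struct iommu_hwpt_vtd_s1 vtd = {
                         .pgtbl_addr = guest_s1_pgtbl, /* guest-physical S1 table */
                         .addr_width = 48,             /* 4-level stage-1 format */
                 };
                 struct iommu_hwpt_alloc nested = {
                         .size = sizeof(nested),
                         .dev_id = dev_id,
                         .data_type = IOMMU_HWPT_DATA_VTD_S1,
                         .data_len = sizeof(vtd),
                         .data_uptr = (__u64)(uintptr_t)&vtd,
                 };

                 if (ioctl(fd, IOMMU_HWPT_ALLOC, &parent))
                         return -1;
                 nested.pt_id = parent.out_hwpt_id;  /* nest on the parent HWPT */
                 if (ioctl(fd, IOMMU_HWPT_ALLOC, &nested))
                         return -1;
                 return nested.out_hwpt_id;  /* attach this HWPT to the VFIO device */
         }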

  The series includes AMD and Intel support for dirty tracking, and
  Intel support for nested translation.
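
  For the dirty tracking capability the userspace flow is roughly: create
  the HWPT with IOMMU_HWPT_ALLOC_DIRTY_TRACKING, toggle tracking with
  IOMMU_HWPT_SET_DIRTY_TRACKING, and pull the per-page dirty state into a
  user-supplied bitmap with IOMMU_HWPT_GET_DIRTY_BITMAP. A hedged sketch
  (the ioctls and structs appear in the diff below; the enable flag name
  and exact field layout are assumptions to verify against
  include/uapi/linux/iommufd.h):

      #include <stdint.h>
      #include <sys/ioctl.h>
      #include <linux/iommufd.h>

      /* Hypothetical helper: snapshot dirty pages of [iova, iova + length)
       * into 'bitmap', one bit per page_size-sized page (caller-allocated). */
      static int read_dirty(int fd, __u32 hwpt_id, __u64 iova, __u64 length,
                            __u64 page_size, void *bitmap)
      {
              struct iommu_hwpt_set_dirty_tracking set = {
                      .size = sizeof(set),
                      .flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE, /* assumed flag name */
                      .hwpt_id = hwpt_id,
              };
              struct iommu_hwpt_get_dirty_bitmap get = {
                      .size = sizeof(get),
                      .hwpt_id = hwpt_id,
                      .iova = iova,
                      .length = length,
                      .page_size = page_size,
                      .data = (__u64)(uintptr_t)bitmap,
              };

              if (ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
                      return -1;
              /* ... let DMA run, then read and clear the accumulated bits ... */
              return ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get);
      }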

  Along the way are a number of internal items:

   - New iommu core items: ops->domain_alloc_user(),
     ops->set_dirty_tracking(), ops->read_and_clear_dirty(),
     IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user() (a
     driver-side sketch follows this list)

   - UAF fix in iopt_area_split()

   - Spelling fixes and some test suite improvement"
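
  As a driver-side illustration of iommu_copy_struct_from_user(): a
  domain_alloc_user implementation copies its vendor-specific user data
  out of the iommufd-provided struct iommu_user_data, bounded by the last
  member of the initial uAPI version. The sketch below mirrors the Intel
  nested.c hunk further down in this diff; 'my_hwpt_data' and
  MY_HWPT_DATA_TYPE are hypothetical placeholders, not real uAPI:

      /* Kernel context; needs <linux/iommu.h>. */
      struct my_hwpt_data {                 /* hypothetical driver uAPI struct */
              __u64 pgtbl_addr;
              __u32 flags;
              __u32 __reserved;             /* last member of the initial version */
      };

      static struct iommu_domain *
      my_domain_alloc_user(struct device *dev, u32 flags,
                           struct iommu_domain *parent,
                           const struct iommu_user_data *user_data)
      {
              struct my_hwpt_data data;
              int ret;

              /* Checks user_data->type against MY_HWPT_DATA_TYPE and copies at
               * most sizeof(data), requiring at least up to __reserved. */
              ret = iommu_copy_struct_from_user(&data, user_data,
                                                MY_HWPT_DATA_TYPE, __reserved);
              if (ret)
                      return ERR_PTR(ret);

              /* ... allocate and fully initialize the domain from 'data' ... */
              return ERR_PTR(-EOPNOTSUPP);  /* placeholder */
      }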

* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
  iommufd: Organize the mock domain alloc functions closer to Joerg's tree
  iommufd/selftest: Fix page-size check in iommufd_test_dirty()
  iommufd: Add iopt_area_alloc()
  iommufd: Fix missing update of domains_itree after splitting iopt_area
  iommu/vt-d: Disallow read-only mappings to nest parent domain
  iommu/vt-d: Add nested domain allocation
  iommu/vt-d: Set the nested domain to a device
  iommu/vt-d: Make domain attach helpers to be extern
  iommu/vt-d: Add helper to setup pasid nested translation
  iommu/vt-d: Add helper for nested domain allocation
  iommu/vt-d: Extend dmar_domain to support nested domain
  iommufd: Add data structure for Intel VT-d stage-1 domain allocation
  iommu/vt-d: Enhance capability check for nested parent domain allocation
  iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
  iommufd/selftest: Add nested domain allocation for mock domain
  iommu: Add iommu_copy_struct_from_user helper
  iommufd: Add a nested HW pagetable object
  iommu: Pass in parent domain with user_data to domain_alloc_user op
  iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
  iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
  ...
parents ff269e2c b2b67c99
......@@ -7,6 +7,10 @@ config IOMMU_IOVA
config IOMMU_API
bool
config IOMMUFD_DRIVER
bool
default n
menuconfig IOMMU_SUPPORT
bool "IOMMU Hardware Support"
depends on MMU
......
......@@ -10,6 +10,7 @@ config AMD_IOMMU
select IOMMU_API
select IOMMU_IOVA
select IOMMU_IO_PGTABLE
select IOMMUFD_DRIVER if IOMMUFD
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
......
......@@ -97,7 +97,9 @@
#define FEATURE_GATS_MASK (3ULL)
#define FEATURE_GAM_VAPIC BIT_ULL(21)
#define FEATURE_GIOSUP BIT_ULL(48)
#define FEATURE_HASUP BIT_ULL(49)
#define FEATURE_EPHSUP BIT_ULL(50)
#define FEATURE_HDSUP BIT_ULL(52)
#define FEATURE_SNP BIT_ULL(63)
#define FEATURE_PASID_SHIFT 32
......@@ -212,6 +214,7 @@
/* macros and definitions for device table entries */
#define DEV_ENTRY_VALID 0x00
#define DEV_ENTRY_TRANSLATION 0x01
#define DEV_ENTRY_HAD 0x07
#define DEV_ENTRY_PPR 0x34
#define DEV_ENTRY_IR 0x3d
#define DEV_ENTRY_IW 0x3e
......@@ -370,10 +373,16 @@
#define PTE_LEVEL_PAGE_SIZE(level) \
(1ULL << (12 + (9 * (level))))
/*
* The IOPTE dirty bit
*/
#define IOMMU_PTE_HD_BIT (6)
/*
* Bit value definition for I/O PTE fields
*/
#define IOMMU_PTE_PR BIT_ULL(0)
#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
#define IOMMU_PTE_U BIT_ULL(59)
#define IOMMU_PTE_FC BIT_ULL(60)
#define IOMMU_PTE_IR BIT_ULL(61)
......@@ -384,6 +393,7 @@
*/
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX_SHIFT (56)
......@@ -413,6 +423,7 @@
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
......@@ -563,6 +574,7 @@ struct protection_domain {
int nid; /* Node ID */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
bool dirty_tracking; /* dirty tracking is enabled in the domain */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
......
......@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
return (__pte & ~offset_mask) | (iova & offset_mask);
}
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
unsigned long flags)
{
bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
bool dirty = false;
int i, count;
/*
* 2.2.3.2 Host Dirty Support
* When a non-default page size is used, software must OR the
* Dirty bits in all of the replicated host PTEs used to map
* the page. The IOMMU does not guarantee the Dirty bits are
* set in all of the replicated PTEs. Any portion of the page
* may have been written even if the Dirty bit is set in only
* one of the replicated PTEs.
*/
count = PAGE_SIZE_PTE_COUNT(size);
for (i = 0; i < count && test_only; i++) {
if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
dirty = true;
break;
}
}
for (i = 0; i < count && !test_only; i++) {
if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
(unsigned long *)&ptep[i])) {
dirty = true;
}
}
return dirty;
}
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
unsigned long end = iova + size - 1;
do {
unsigned long pgsize = 0;
u64 *ptep, pte;
ptep = fetch_pte(pgtable, iova, &pgsize);
if (ptep)
pte = READ_ONCE(*ptep);
if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
iova += pgsize;
continue;
}
/*
* Mark the whole IOVA range as dirty even if only one of
* the replicated PTEs was marked dirty.
*/
if (pte_test_and_clear_dirty(ptep, pgsize, flags))
iommu_dirty_bitmap_record(dirty, iova, pgsize);
iova += pgsize;
} while (iova < end);
return 0;
}
/*
* ----------------------------------------------------
*/
......@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
pgtable->iop.ops.map_pages = iommu_v1_map_pages;
pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
return &pgtable->iop;
}
......
......@@ -37,6 +37,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
#include "../dma-iommu.h"
......@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
const struct iommu_dirty_ops amd_dirty_ops;
static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;
......@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
pte_root |= 1ULL << DEV_ENTRY_PPR;
}
if (domain->dirty_tracking)
pte_root |= DTE_FLAG_HAD;
if (domain->flags & PD_IOMMUV2_MASK) {
u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
u64 glx = domain->glx;
......@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
}
static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
return iommu && (iommu->features & FEATURE_HDSUP);
}
static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
struct device *dev, u32 flags)
{
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
struct protection_domain *domain;
struct amd_iommu *iommu = NULL;
if (dev) {
iommu = rlookup_amd_iommu(dev);
if (!iommu)
return ERR_PTR(-ENODEV);
}
/*
* Since DTE[Mode]=0 is prohibited on SNP-enabled system,
* default to use IOMMU_DOMAIN_DMA[_FQ].
*/
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
return NULL;
return ERR_PTR(-EINVAL);
if (dirty_tracking && !amd_iommu_hd_support(iommu))
return ERR_PTR(-EOPNOTSUPP);
domain = protection_domain_alloc(type);
if (!domain)
return NULL;
return ERR_PTR(-ENOMEM);
domain->domain.geometry.aperture_start = 0;
domain->domain.geometry.aperture_end = dma_max_address();
domain->domain.geometry.force_aperture = true;
if (iommu) {
domain->domain.type = type;
domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
domain->domain.ops = iommu->iommu.ops->default_domain_ops;
if (dirty_tracking)
domain->domain.dirty_ops = &amd_dirty_ops;
}
return &domain->domain;
}
static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
{
struct iommu_domain *domain;
domain = do_iommu_domain_alloc(type, NULL, 0);
if (IS_ERR(domain))
return NULL;
return domain;
}
static struct iommu_domain *
amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
struct iommu_domain *parent,
const struct iommu_user_data *user_data)
{
unsigned int type = IOMMU_DOMAIN_UNMANAGED;
if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
return ERR_PTR(-EOPNOTSUPP);
return do_iommu_domain_alloc(type, dev, flags);
}
static void amd_iommu_domain_free(struct iommu_domain *dom)
{
struct protection_domain *domain;
......@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
dev_data->defer_attach = false;
/*
* Restrict to devices with compatible IOMMU hardware support
* when enforcement of dirty tracking is enabled.
*/
if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
return -EINVAL;
if (dev_data->domain)
detach_device(dev);
......@@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return true;
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
case IOMMU_CAP_DIRTY_TRACKING: {
struct amd_iommu *iommu = rlookup_amd_iommu(dev);
return amd_iommu_hd_support(iommu);
}
default:
break;
}
......@@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return false;
}
static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
struct protection_domain *pdomain = to_pdomain(domain);
struct dev_table_entry *dev_table;
struct iommu_dev_data *dev_data;
bool domain_flush = false;
struct amd_iommu *iommu;
unsigned long flags;
u64 pte_root;
spin_lock_irqsave(&pdomain->lock, flags);
if (!(pdomain->dirty_tracking ^ enable)) {
spin_unlock_irqrestore(&pdomain->lock, flags);
return 0;
}
list_for_each_entry(dev_data, &pdomain->dev_list, list) {
iommu = rlookup_amd_iommu(dev_data->dev);
if (!iommu)
continue;
dev_table = get_dev_table(iommu);
pte_root = dev_table[dev_data->devid].data[0];
pte_root = (enable ? pte_root | DTE_FLAG_HAD :
pte_root & ~DTE_FLAG_HAD);
/* Flush device DTE */
dev_table[dev_data->devid].data[0] = pte_root;
device_flush_dte(dev_data);
domain_flush = true;
}
/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
if (domain_flush) {
amd_iommu_domain_flush_tlb_pde(pdomain);
amd_iommu_domain_flush_complete(pdomain);
}
pdomain->dirty_tracking = enable;
spin_unlock_irqrestore(&pdomain->lock, flags);
return 0;
}
static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct protection_domain *pdomain = to_pdomain(domain);
struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
unsigned long lflags;
if (!ops || !ops->read_and_clear_dirty)
return -EOPNOTSUPP;
spin_lock_irqsave(&pdomain->lock, lflags);
if (!pdomain->dirty_tracking && dirty->bitmap) {
spin_unlock_irqrestore(&pdomain->lock, lflags);
return -EINVAL;
}
spin_unlock_irqrestore(&pdomain->lock, lflags);
return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
}
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
......@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
const struct iommu_dirty_ops amd_dirty_ops = {
.set_dirty_tracking = amd_iommu_set_dirty_tracking,
.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
};
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.domain_alloc = amd_iommu_domain_alloc,
.domain_alloc_user = amd_iommu_domain_alloc_user,
.probe_device = amd_iommu_probe_device,
.release_device = amd_iommu_release_device,
.probe_finalize = amd_iommu_probe_finalize,
......
......@@ -15,6 +15,7 @@ config INTEL_IOMMU
select DMA_OPS
select IOMMU_API
select IOMMU_IOVA
select IOMMUFD_DRIVER if IOMMUFD
select NEED_DMA_MAP_STATE
select DMAR_TABLE
select SWIOTLB
......
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DMAR_TABLE) += dmar.o
obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o
obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
......
......@@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units);
#define for_each_rmrr_units(rmrr) \
list_for_each_entry(rmrr, &dmar_rmrr_units, list)
static void device_block_translation(struct device *dev);
static void intel_iommu_domain_free(struct iommu_domain *domain);
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
......@@ -300,6 +299,7 @@ static int iommu_skip_te_disable;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
......@@ -560,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
}
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
void domain_update_iommu_cap(struct dmar_domain *domain)
{
domain_update_iommu_coherency(domain);
domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
......@@ -1778,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type)
return domain;
}
static int domain_attach_iommu(struct dmar_domain *domain,
struct intel_iommu *iommu)
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info, *curr;
unsigned long ndomains;
......@@ -1828,8 +1827,7 @@ static int domain_attach_iommu(struct dmar_domain *domain,
return ret;
}
static void domain_detach_iommu(struct dmar_domain *domain,
struct intel_iommu *iommu)
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info;
......@@ -2196,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
return -EINVAL;
if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
return -EINVAL;
}
attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
attr |= DMA_FL_PTE_PRESENT;
if (domain->use_first_level) {
......@@ -3958,7 +3961,7 @@ static void dmar_remove_one_dev_info(struct device *dev)
* all DMA requests without PASID from the device are blocked. If the page
* table has been set, clean up the data structures.
*/
static void device_block_translation(struct device *dev)
void device_block_translation(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
......@@ -4058,14 +4061,62 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
return NULL;
}
static struct iommu_domain *
intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
struct iommu_domain *parent,
const struct iommu_user_data *user_data)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
struct intel_iommu *iommu = info->iommu;
struct iommu_domain *domain;
/* Must be NESTING domain */
if (parent) {
if (!nested_supported(iommu) || flags)
return ERR_PTR(-EOPNOTSUPP);
return intel_nested_domain_alloc(parent, user_data);
}
if (flags &
(~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
return ERR_PTR(-EOPNOTSUPP);
if (nested_parent && !nested_supported(iommu))
return ERR_PTR(-EOPNOTSUPP);
if (user_data || (dirty_tracking && !ssads_supported(iommu)))
return ERR_PTR(-EOPNOTSUPP);
/*
* domain_alloc_user op needs to fully initialize a domain before
* return, so use iommu_domain_alloc() here for simplicity.
*/
domain = iommu_domain_alloc(dev->bus);
if (!domain)
return ERR_PTR(-ENOMEM);
if (nested_parent)
to_dmar_domain(domain)->nested_parent = true;
if (dirty_tracking) {
if (to_dmar_domain(domain)->use_first_level) {
iommu_domain_free(domain);
return ERR_PTR(-EOPNOTSUPP);
}
domain->dirty_ops = &intel_dirty_ops;
}
return domain;
}
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
if (domain != &si_domain->domain && domain != &blocking_domain)
domain_exit(to_dmar_domain(domain));
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
struct device *dev)
int prepare_domain_attach_device(struct iommu_domain *domain,
struct device *dev)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu;
......@@ -4078,6 +4129,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
return -EINVAL;
if (domain->dirty_ops && !ssads_supported(iommu))
return -EINVAL;
/* check if this iommu agaw is sufficient for max mapped address */
addr_width = agaw_to_width(iommu->agaw);
if (addr_width > cap_mgaw(iommu->cap))
......@@ -4332,6 +4386,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
return dmar_platform_optin();
case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
return ecap_sc_support(info->iommu->ecap);
case IOMMU_CAP_DIRTY_TRACKING:
return ssads_supported(info->iommu);
default:
return false;
}
......@@ -4729,6 +4785,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
return -EOPNOTSUPP;
if (domain->dirty_ops)
return -EINVAL;
if (context_copied(iommu, info->bus, info->devfn))
return -EBUSY;
......@@ -4780,6 +4839,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
if (!vtd)
return ERR_PTR(-ENOMEM);
vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
vtd->cap_reg = iommu->cap;
vtd->ecap_reg = iommu->ecap;
*length = sizeof(*vtd);
......@@ -4787,10 +4847,88 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
return vtd;
}
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct device_domain_info *info;
int ret;
spin_lock(&dmar_domain->lock);
if (dmar_domain->dirty_tracking == enable)
goto out_unlock;
list_for_each_entry(info, &dmar_domain->devices, link) {
ret = intel_pasid_setup_dirty_tracking(info->iommu,
info->domain, info->dev,
IOMMU_NO_PASID, enable);
if (ret)
goto err_unwind;
}
dmar_domain->dirty_tracking = enable;
out_unlock:
spin_unlock(&dmar_domain->lock);
return 0;
err_unwind:
list_for_each_entry(info, &dmar_domain->devices, link)
intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
info->dev, IOMMU_NO_PASID,
dmar_domain->dirty_tracking);
spin_unlock(&dmar_domain->lock);
return ret;
}
static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
unsigned long end = iova + size - 1;
unsigned long pgsize;
/*
* IOMMUFD core calls into a dirty tracking disabled domain without an
* IOVA bitmap set in order to clean dirty bits in all PTEs that might
* have occurred when we stopped dirty tracking. This ensures that we
* never inherit dirtied bits from a previous cycle.
*/
if (!dmar_domain->dirty_tracking && dirty->bitmap)
return -EINVAL;
do {
struct dma_pte *pte;
int lvl = 0;
pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
GFP_ATOMIC);
pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
if (!pte || !dma_pte_present(pte)) {
iova += pgsize;
continue;
}
if (dma_sl_pte_test_and_clear_dirty(pte, flags))
iommu_dirty_bitmap_record(dirty, iova, pgsize);
iova += pgsize;
} while (iova < end);
return 0;
}
const struct iommu_dirty_ops intel_dirty_ops = {
.set_dirty_tracking = intel_iommu_set_dirty_tracking,
.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
const struct iommu_ops intel_iommu_ops = {
.capable = intel_iommu_capable,
.hw_info = intel_iommu_hw_info,
.domain_alloc = intel_iommu_domain_alloc,
.domain_alloc_user = intel_iommu_domain_alloc_user,
.probe_device = intel_iommu_probe_device,
.probe_finalize = intel_iommu_probe_finalize,
.release_device = intel_iommu_release_device,
......
......@@ -25,6 +25,7 @@
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>
/*
* VT-d hardware uses 4KiB page size regardless of host page size.
......@@ -48,6 +49,9 @@
#define DMA_FL_PTE_DIRTY BIT_ULL(6)
#define DMA_FL_PTE_XD BIT_ULL(63)
#define DMA_SL_PTE_DIRTY_BIT 9
#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
#define ADDR_WIDTH_5LEVEL (57)
#define ADDR_WIDTH_4LEVEL (48)
......@@ -539,6 +543,10 @@ enum {
#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu) (sm_supported(iommu) && \
ecap_pasid((iommu)->ecap))
#define ssads_supported(iommu) (sm_supported(iommu) && \
ecap_slads((iommu)->ecap))
#define nested_supported(iommu) (sm_supported(iommu) && \
ecap_nest((iommu)->ecap))
struct pasid_entry;
struct pasid_state_entry;
......@@ -592,20 +600,45 @@ struct dmar_domain {
* otherwise, goes through the second
* level.
*/
u8 dirty_tracking:1; /* Dirty tracking is enabled */
u8 nested_parent:1; /* Has other domains nested on it */
spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */
struct list_head dev_pasids; /* all attached pasids */
struct dma_pte *pgd; /* virtual address */
int gaw; /* max guest address width */
/* adjusted guest address width, 0 is level 2 30-bit */
int agaw;
int iommu_superpage;/* Level of superpages supported:
0 == 4KiB (no superpages), 1 == 2MiB,
2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
u64 max_addr; /* maximum mapped address */
union {
/* DMA remapping domain */
struct {
/* virtual address */
struct dma_pte *pgd;
/* max guest address width */
int gaw;
/*
* adjusted guest address width:
* 0: level 2 30-bit
* 1: level 3 39-bit
* 2: level 4 48-bit
* 3: level 5 57-bit
*/
int agaw;
/* maximum mapped address */
u64 max_addr;
};
/* Nested user domain */
struct {
/* parent page table which the user domain is nested on */
struct dmar_domain *s2_domain;
/* user page table pointer (in GPA) */
unsigned long s1_pgtbl;
/* page table attributes */
struct iommu_hwpt_vtd_s1 s1_cfg;
};
};
struct iommu_domain domain; /* generic domain data structure for
iommu core */
......@@ -781,6 +814,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
unsigned long flags)
{
if (flags & IOMMU_DIRTY_NO_CLEAR)
return (pte->val & DMA_SL_PTE_DIRTY) != 0;
return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
(unsigned long *)&pte->val);
}
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
......@@ -836,12 +879,21 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
*/
#define QI_OPT_WAIT_DRAIN BIT(0)
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void device_block_translation(struct device *dev);
int prepare_domain_attach_device(struct iommu_domain *domain,
struct device *dev);
void domain_update_iommu_cap(struct dmar_domain *domain);
int dmar_ir_support(void);
void *alloc_pgtable_page(int node, gfp_t gfp);
void free_pgtable_page(void *vaddr);
void iommu_flush_write_buffer(struct intel_iommu *iommu);
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
const struct iommu_user_data *user_data);
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
......
// SPDX-License-Identifier: GPL-2.0
/*
* nested.c - nested mode translation support
*
* Copyright (C) 2023 Intel Corporation
*
* Author: Lu Baolu <baolu.lu@linux.intel.com>
* Jacob Pan <jacob.jun.pan@linux.intel.com>
* Yi Liu <yi.l.liu@intel.com>
*/
#define pr_fmt(fmt) "DMAR: " fmt
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include "iommu.h"
#include "pasid.h"
static int intel_nested_attach_dev(struct iommu_domain *domain,
struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu = info->iommu;
unsigned long flags;
int ret = 0;
if (info->domain)
device_block_translation(dev);
if (iommu->agaw < dmar_domain->s2_domain->agaw) {
dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
return -ENODEV;
}
/*
* Stage-1 domain cannot work alone, it is nested on a s2_domain.
* The s2_domain will be used in nested translation, hence needs
* to ensure the s2_domain is compatible with this IOMMU.
*/
ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev);
if (ret) {
dev_err_ratelimited(dev, "s2 domain is not compatible\n");
return ret;
}
ret = domain_attach_iommu(dmar_domain, iommu);
if (ret) {
dev_err_ratelimited(dev, "Failed to attach domain to iommu\n");
return ret;
}
ret = intel_pasid_setup_nested(iommu, dev,
IOMMU_NO_PASID, dmar_domain);
if (ret) {
domain_detach_iommu(dmar_domain, iommu);
dev_err_ratelimited(dev, "Failed to setup pasid entry\n");
return ret;
}
info->domain = dmar_domain;
spin_lock_irqsave(&dmar_domain->lock, flags);
list_add(&info->link, &dmar_domain->devices);
spin_unlock_irqrestore(&dmar_domain->lock, flags);
return 0;
}
static void intel_nested_domain_free(struct iommu_domain *domain)
{
kfree(to_dmar_domain(domain));
}
static const struct iommu_domain_ops intel_nested_domain_ops = {
.attach_dev = intel_nested_attach_dev,
.free = intel_nested_domain_free,
};
struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
const struct iommu_user_data *user_data)
{
struct dmar_domain *s2_domain = to_dmar_domain(parent);
struct iommu_hwpt_vtd_s1 vtd;
struct dmar_domain *domain;
int ret;
/* Must be nested domain */
if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
return ERR_PTR(-EOPNOTSUPP);
if (parent->ops != intel_iommu_ops.default_domain_ops ||
!s2_domain->nested_parent)
return ERR_PTR(-EINVAL);
ret = iommu_copy_struct_from_user(&vtd, user_data,
IOMMU_HWPT_DATA_VTD_S1, __reserved);
if (ret)
return ERR_PTR(ret);
domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT);
if (!domain)
return ERR_PTR(-ENOMEM);
domain->use_first_level = true;
domain->s2_domain = s2_domain;
domain->s1_pgtbl = vtd.pgtbl_addr;
domain->s1_cfg = vtd;
domain->domain.ops = &intel_nested_domain_ops;
domain->domain.type = IOMMU_DOMAIN_NESTED;
INIT_LIST_HEAD(&domain->devices);
INIT_LIST_HEAD(&domain->dev_pasids);
spin_lock_init(&domain->lock);
xa_init(&domain->iommu_array);
return &domain->domain;
}
......@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
WRITE_ONCE(*ptr, (old & ~mask) | bits);
}
static inline u64 pasid_get_bits(u64 *ptr)
{
return READ_ONCE(*ptr);
}
/*
* Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
* PASID entry.
......@@ -335,6 +340,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
pasid_set_bits(&pe->val[0], 1 << 1, 0);
}
/*
* Enable second level A/D bits by setting the SLADE (Second Level
* Access Dirty Enable) field (Bit 9) of a scalable mode PASID
* entry.
*/
static inline void pasid_set_ssade(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
}
/*
* Disable second level A/D bits by clearing the SLADE (Second Level
* Access Dirty Enable) field (Bit 9) of a scalable mode PASID
* entry.
*/
static inline void pasid_clear_ssade(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[0], 1 << 9, 0);
}
/*
* Checks if second level A/D bits specifically the SLADE (Second Level
* Access Dirty Enable) field (Bit 9) of a scalable mode PASID
* entry is set.
*/
static inline bool pasid_get_ssade(struct pasid_entry *pe)
{
return pasid_get_bits(&pe->val[0]) & (1 << 9);
}
/*
* Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
* scalable mode PASID entry.
*/
static inline void pasid_set_sre(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[2], 1 << 0, 1);
}
/*
* Setup the WPE(Write Protect Enable) field (Bit 132) of a
* scalable mode PASID entry.
......@@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value)
pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
}
/*
* Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
* of a scalable mode PASID entry.
*/
static inline void pasid_set_eafe(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
}
static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
u16 did, u32 pasid)
......@@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
if (domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_present(pte);
spin_unlock(&iommu->lock);
......@@ -636,6 +691,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return 0;
}
/*
* Set up dirty tracking on a second only or nested translation type.
*/
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid,
bool enabled)
{
struct pasid_entry *pte;
u16 did, pgtt;
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
dev_err_ratelimited(
dev, "Failed to get pasid entry of PASID %d\n", pasid);
return -ENODEV;
}
did = domain_id_iommu(domain, iommu);
pgtt = pasid_pte_get_pgtt(pte);
if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
pgtt != PASID_ENTRY_PGTT_NESTED) {
spin_unlock(&iommu->lock);
dev_err_ratelimited(
dev,
"Dirty tracking not supported on translation type %d\n",
pgtt);
return -EOPNOTSUPP;
}
if (pasid_get_ssade(pte) == enabled) {
spin_unlock(&iommu->lock);
return 0;
}
if (enabled)
pasid_set_ssade(pte);
else
pasid_clear_ssade(pte);
spin_unlock(&iommu->lock);
if (!ecap_coherent(iommu->ecap))
clflush_cache_range(pte, sizeof(*pte));
/*
* From VT-d spec table 25 "Guidance to Software for Invalidations":
*
* - PASID-selective-within-Domain PASID-cache invalidation
* If (PGTT=SS or Nested)
* - Domain-selective IOTLB invalidation
* Else
* - PASID-selective PASID-based IOTLB invalidation
* - If (pasid is RID_PASID)
* - Global Device-TLB invalidation to affected functions
* Else
* - PASID-based Device-TLB invalidation (with S=1 and
* Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
*/
pasid_cache_invalidation_with_pasid(iommu, did, pasid);
iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
/* Device IOTLB doesn't need to be flushed in caching mode. */
if (!cap_caching_mode(iommu->cap))
devtlb_invalidation_with_pasid(iommu, dev, pasid);
return 0;
}
/*
* Set up the scalable mode pasid entry for passthrough translation type.
*/
......@@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
if (!cap_caching_mode(iommu->cap))
devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
/**
* intel_pasid_setup_nested() - Set up PASID entry for nested translation.
* @iommu: IOMMU which the device belong to
* @dev: Device to be set up for translation
* @pasid: PASID to be programmed in the device PASID table
* @domain: User stage-1 domain nested on a stage-2 domain
*
* This is used for nested translation. The input domain should be
* nested type and nested on a parent with the 'nested_parent' flag
* set.
*/
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
u32 pasid, struct dmar_domain *domain)
{
struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
struct dmar_domain *s2_domain = domain->s2_domain;
u16 did = domain_id_iommu(domain, iommu);
struct dma_pte *pgd = s2_domain->pgd;
struct pasid_entry *pte;
/* Address width should match the address width supported by hardware */
switch (s1_cfg->addr_width) {
case ADDR_WIDTH_4LEVEL:
break;
case ADDR_WIDTH_5LEVEL:
if (!cap_fl5lp_support(iommu->cap)) {
dev_err_ratelimited(dev,
"5-level paging not supported\n");
return -EINVAL;
}
break;
default:
dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
s1_cfg->addr_width);
return -EINVAL;
}
if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
pr_err_ratelimited("No supervisor request support on %s\n",
iommu->name);
return -EINVAL;
}
if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
pr_err_ratelimited("No extended access flag support on %s\n",
iommu->name);
return -EINVAL;
}
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
return -ENODEV;
}
if (pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
return -EBUSY;
}
pasid_clear_entry(pte);
if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
pasid_set_flpm(pte, 1);
pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
pasid_set_sre(pte);
if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
pasid_set_wpe(pte);
}
if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
pasid_set_eafe(pte);
if (s2_domain->force_snooping)
pasid_set_pgsnp(pte);
pasid_set_slptr(pte, virt_to_phys(pgd));
pasid_set_fault_enable(pte);
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, s2_domain->agaw);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
pasid_set_present(pte);
spin_unlock(&iommu->lock);
pasid_flush_caches(iommu, pte, pasid, did);
return 0;
}
......@@ -106,9 +106,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid,
bool enabled);
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
u32 pasid, struct dmar_domain *domain);
void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
bool fault_ignore);
......
......@@ -11,3 +11,4 @@ iommufd-y := \
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
......@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <uapi/linux/iommufd.h>
#include "io_pagetable.h"
#include "double_span.h"
......@@ -221,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
return 0;
}
static struct iopt_area *iopt_area_alloc(void)
{
struct iopt_area *area;
area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
if (!area)
return NULL;
RB_CLEAR_NODE(&area->node.rb);
RB_CLEAR_NODE(&area->pages_node.rb);
return area;
}
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
......@@ -231,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int rc = 0;
list_for_each_entry(elm, pages_list, next) {
elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
elm->area = iopt_area_alloc();
if (!elm->area)
return -ENOMEM;
}
......@@ -412,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
return 0;
}
struct iova_bitmap_fn_arg {
unsigned long flags;
struct io_pagetable *iopt;
struct iommu_domain *domain;
struct iommu_dirty_bitmap *dirty;
};
static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
unsigned long iova, size_t length,
void *opaque)
{
struct iopt_area *area;
struct iopt_area_contig_iter iter;
struct iova_bitmap_fn_arg *arg = opaque;
struct iommu_domain *domain = arg->domain;
struct iommu_dirty_bitmap *dirty = arg->dirty;
const struct iommu_dirty_ops *ops = domain->dirty_ops;
unsigned long last_iova = iova + length - 1;
unsigned long flags = arg->flags;
int ret;
iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
unsigned long last = min(last_iova, iopt_area_last_iova(area));
ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
last - iter.cur_iova + 1, flags,
dirty);
if (ret)
return ret;
}
if (!iopt_area_contig_done(&iter))
return -EINVAL;
return 0;
}
static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
struct io_pagetable *iopt, unsigned long flags,
struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
const struct iommu_dirty_ops *ops = domain->dirty_ops;
struct iommu_iotlb_gather gather;
struct iommu_dirty_bitmap dirty;
struct iova_bitmap_fn_arg arg;
struct iova_bitmap *iter;
int ret = 0;
if (!ops || !ops->read_and_clear_dirty)
return -EOPNOTSUPP;
iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
bitmap->page_size,
u64_to_user_ptr(bitmap->data));
if (IS_ERR(iter))
return -ENOMEM;
iommu_dirty_bitmap_init(&dirty, iter, &gather);
arg.flags = flags;
arg.iopt = iopt;
arg.domain = domain;
arg.dirty = &dirty;
iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
if (!(flags & IOMMU_DIRTY_NO_CLEAR))
iommu_iotlb_sync(domain, &gather);
iova_bitmap_free(iter);
return ret;
}
int iommufd_check_iova_range(struct io_pagetable *iopt,
struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
size_t iommu_pgsize = iopt->iova_alignment;
u64 last_iova;
if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
return -EOVERFLOW;
if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
return -EOVERFLOW;
if ((bitmap->iova & (iommu_pgsize - 1)) ||
((last_iova + 1) & (iommu_pgsize - 1)))
return -EINVAL;
if (!bitmap->page_size)
return -EINVAL;
if ((bitmap->iova & (bitmap->page_size - 1)) ||
((last_iova + 1) & (bitmap->page_size - 1)))
return -EINVAL;
return 0;
}
int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
struct iommu_domain *domain,
unsigned long flags,
struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
int ret;
ret = iommufd_check_iova_range(iopt, bitmap);
if (ret)
return ret;
down_read(&iopt->iova_rwsem);
ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
up_read(&iopt->iova_rwsem);
return ret;
}
static int iopt_clear_dirty_data(struct io_pagetable *iopt,
struct iommu_domain *domain)
{
const struct iommu_dirty_ops *ops = domain->dirty_ops;
struct iommu_iotlb_gather gather;
struct iommu_dirty_bitmap dirty;
struct iopt_area *area;
int ret = 0;
lockdep_assert_held_read(&iopt->iova_rwsem);
iommu_dirty_bitmap_init(&dirty, NULL, &gather);
for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
if (!area->pages)
continue;
ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
iopt_area_length(area), 0,
&dirty);
if (ret)
break;
}
iommu_iotlb_sync(domain, &gather);
return ret;
}
int iopt_set_dirty_tracking(struct io_pagetable *iopt,
struct iommu_domain *domain, bool enable)
{
const struct iommu_dirty_ops *ops = domain->dirty_ops;
int ret = 0;
if (!ops)
return -EOPNOTSUPP;
down_read(&iopt->iova_rwsem);
/* Clear dirty bits from PTEs to ensure a clean snapshot */
if (enable) {
ret = iopt_clear_dirty_data(iopt, domain);
if (ret)
goto out_unlock;
}
ret = ops->set_dirty_tracking(domain, enable);
out_unlock:
up_read(&iopt->iova_rwsem);
return ret;
}
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, struct list_head *pages_list)
{
......@@ -1005,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
lhs = iopt_area_alloc();
if (!lhs)
return -ENOMEM;
rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
rhs = iopt_area_alloc();
if (!rhs) {
rc = -ENOMEM;
goto err_free_lhs;
......@@ -1048,6 +1232,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (WARN_ON(rc))
goto err_remove_lhs;
/*
* If the original area has filled a domain, domains_itree has to be
* updated.
*/
if (area->storage_domain) {
interval_tree_remove(&area->pages_node, &pages->domains_itree);
interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
}
lhs->storage_domain = area->storage_domain;
lhs->pages = area->pages;
rhs->storage_domain = area->storage_domain;
......
......@@ -8,6 +8,9 @@
#include <linux/xarray.h>
#include <linux/refcount.h>
#include <linux/uaccess.h>
#include <linux/iommu.h>
#include <linux/iova_bitmap.h>
#include <uapi/linux/iommufd.h>
struct iommu_domain;
struct iommu_group;
......@@ -70,6 +73,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, unsigned long *unmapped);
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
struct iommu_domain *domain,
unsigned long flags,
struct iommu_hwpt_get_dirty_bitmap *bitmap);
int iopt_set_dirty_tracking(struct io_pagetable *iopt,
struct iommu_domain *domain, bool enable);
void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
unsigned long length);
int iopt_table_add_domain(struct io_pagetable *iopt,
......@@ -113,7 +123,8 @@ enum iommufd_object_type {
IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_DEVICE,
IOMMUFD_OBJ_HW_PAGETABLE,
IOMMUFD_OBJ_HWPT_PAGING,
IOMMUFD_OBJ_HWPT_NESTED,
IOMMUFD_OBJ_IOAS,
IOMMUFD_OBJ_ACCESS,
#ifdef CONFIG_IOMMUFD_TEST
......@@ -171,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
size_t size,
enum iommufd_object_type type);
#define iommufd_object_alloc(ictx, ptr, type) \
#define __iommufd_object_alloc(ictx, ptr, type, obj) \
container_of(_iommufd_object_alloc( \
ictx, \
sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
......@@ -180,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
type), \
typeof(*(ptr)), obj)
#define iommufd_object_alloc(ictx, ptr, type) \
__iommufd_object_alloc(ictx, ptr, type, obj)
/*
* The IO Address Space (IOAS) pagetable is a virtual page table backed by the
* io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
......@@ -222,6 +236,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
int iommufd_check_iova_range(struct io_pagetable *iopt,
struct iommu_hwpt_get_dirty_bitmap *bitmap);
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
......@@ -231,35 +247,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
*/
struct iommufd_hw_pagetable {
struct iommufd_object obj;
struct iommufd_ioas *ioas;
struct iommu_domain *domain;
};
struct iommufd_hwpt_paging {
struct iommufd_hw_pagetable common;
struct iommufd_ioas *ioas;
bool auto_domain : 1;
bool enforce_cache_coherency : 1;
bool msi_cookie : 1;
bool nest_parent : 1;
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
};
struct iommufd_hw_pagetable *
iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
struct iommufd_device *idev, bool immediate_attach);
int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt);
struct iommufd_hwpt_nested {
struct iommufd_hw_pagetable common;
struct iommufd_hwpt_paging *parent;
};
static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
{
return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING;
}
static inline struct iommufd_hwpt_paging *
to_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
{
return container_of(hwpt, struct iommufd_hwpt_paging, common);
}
static inline struct iommufd_hwpt_paging *
iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id)
{
return container_of(iommufd_get_object(ucmd->ictx, id,
IOMMUFD_OBJ_HWPT_PAGING),
struct iommufd_hwpt_paging, common.obj);
}
int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd);
int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
struct iommufd_hwpt_paging *
iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
struct iommufd_device *idev, u32 flags,
bool immediate_attach,
const struct iommu_user_data *user_data);
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
struct iommufd_device *idev);
struct iommufd_hw_pagetable *
iommufd_hw_pagetable_detach(struct iommufd_device *idev);
void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
void iommufd_hwpt_paging_destroy(struct iommufd_object *obj);
void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
struct iommufd_hw_pagetable *hwpt)
{
lockdep_assert_not_held(&hwpt->ioas->mutex);
if (hwpt->auto_domain)
iommufd_object_deref_user(ictx, &hwpt->obj);
else
refcount_dec(&hwpt->obj.users);
if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
if (hwpt_paging->auto_domain) {
iommufd_object_deref_user(ictx, &hwpt->obj);
return;
}
}
refcount_dec(&hwpt->obj.users);
}
struct iommufd_group {
......
......@@ -19,6 +19,8 @@ enum {
IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,
IOMMU_TEST_OP_ACCESS_REPLACE_IOAS,
IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
IOMMU_TEST_OP_DIRTY,
};
enum {
......@@ -40,6 +42,15 @@ enum {
MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
};
enum {
MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
};
enum {
MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3,
MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
};
struct iommu_test_cmd {
__u32 size;
__u32 op;
......@@ -56,6 +67,13 @@ struct iommu_test_cmd {
/* out_idev_id is the standard iommufd_bind object */
__u32 out_idev_id;
} mock_domain;
struct {
__u32 out_stdev_id;
__u32 out_hwpt_id;
__u32 out_idev_id;
/* Expand mock_domain to set mock device flags */
__u32 dev_flags;
} mock_domain_flags;
struct {
__u32 pt_id;
} mock_domain_replace;
......@@ -95,6 +113,14 @@ struct iommu_test_cmd {
struct {
__u32 ioas_id;
} access_replace_ioas;
struct {
__u32 flags;
__aligned_u64 iova;
__aligned_u64 length;
__aligned_u64 page_size;
__aligned_u64 uptr;
__aligned_u64 out_nr_dirty;
} dirty;
};
__u32 last;
};
......@@ -109,4 +135,17 @@ struct iommu_test_hw_info {
__u32 test_reg;
};
/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
#define IOMMU_HWPT_DATA_SELFTEST 0xdead
#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
/**
* struct iommu_hwpt_selftest
*
* @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT
*/
struct iommu_hwpt_selftest {
__u32 iotlb;
};
#endif
......@@ -268,6 +268,7 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
iova_bitmap_free(bitmap);
return ERR_PTR(rc);
}
EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD);
/**
* iova_bitmap_free() - Frees an IOVA bitmap object
......@@ -289,6 +290,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap)
kfree(bitmap);
}
EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD);
/*
* Returns the remaining bitmap indexes from mapped_total_index to process for
......@@ -387,6 +389,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
return ret;
}
EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD);
/**
* iova_bitmap_set() - Records an IOVA range in bitmap
......@@ -420,4 +423,4 @@ void iova_bitmap_set(struct iova_bitmap *bitmap,
cur_bit += nbits;
} while (cur_bit <= last_bit);
}
EXPORT_SYMBOL_GPL(iova_bitmap_set);
EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD);
......@@ -307,6 +307,8 @@ union ucmd_buffer {
struct iommu_destroy destroy;
struct iommu_hw_info info;
struct iommu_hwpt_alloc hwpt;
struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
struct iommu_ioas_alloc alloc;
struct iommu_ioas_allow_iovas allow_iovas;
struct iommu_ioas_copy ioas_copy;
......@@ -342,6 +344,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
__reserved),
IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
__reserved),
IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
struct iommu_hwpt_get_dirty_bitmap, data),
IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
struct iommu_hwpt_set_dirty_tracking, __reserved),
IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
......@@ -482,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_IOAS] = {
.destroy = iommufd_ioas_destroy,
},
[IOMMUFD_OBJ_HW_PAGETABLE] = {
.destroy = iommufd_hw_pagetable_destroy,
.abort = iommufd_hw_pagetable_abort,
[IOMMUFD_OBJ_HWPT_PAGING] = {
.destroy = iommufd_hwpt_paging_destroy,
.abort = iommufd_hwpt_paging_abort,
},
[IOMMUFD_OBJ_HWPT_NESTED] = {
.destroy = iommufd_hwpt_nested_destroy,
.abort = iommufd_hwpt_nested_abort,
},
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
......@@ -552,5 +562,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
......@@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
area->storage_domain = NULL;
......
......@@ -255,7 +255,7 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
struct iommufd_hw_pagetable *hwpt;
struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_ioas *ioas;
int rc = 1;
......@@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
return PTR_ERR(ioas);
mutex_lock(&ioas->mutex);
list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
if (!hwpt->enforce_cache_coherency) {
list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
if (!hwpt_paging->enforce_cache_coherency) {
rc = 0;
break;
}
......
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_VFIO) += vfio.o
vfio-y += vfio_main.o \
iova_bitmap.o
vfio-y += vfio_main.o
vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o
vfio-$(CONFIG_VFIO_GROUP) += group.o
vfio-$(CONFIG_IOMMUFD) += iommufd.o
......
......@@ -3,6 +3,7 @@ config MLX5_VFIO_PCI
tristate "VFIO support for MLX5 PCI devices"
depends on MLX5_CORE
select VFIO_PCI_CORE
select IOMMUFD_DRIVER
help
This provides migration support for MLX5 devices using the VFIO
framework.
......
......@@ -1517,6 +1517,7 @@ static struct pci_driver mlx5vf_pci_driver = {
module_pci_driver(mlx5vf_pci_driver);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
......
......@@ -5,6 +5,7 @@ config PDS_VFIO_PCI
tristate "VFIO support for PDS PCI devices"
depends on PDS_CORE && PCI_IOV
select VFIO_PCI_CORE
select IOMMUFD_DRIVER
help
This provides generic PCI support for PDS devices using the VFIO
framework.
......
......@@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = {
module_pci_driver(pds_vfio_pci_driver);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
......@@ -1703,6 +1703,7 @@ static void __exit vfio_cleanup(void)
module_init(vfio_init);
module_exit(vfio_cleanup);
MODULE_IMPORT_NS(IOMMUFD);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
......
......@@ -166,6 +166,10 @@ struct io_pgtable_ops {
struct iommu_iotlb_gather *gather);
phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
unsigned long iova);
int (*read_and_clear_dirty)(struct io_pgtable_ops *ops,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty);
};
/**
......
......@@ -13,6 +13,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/of.h>
#include <linux/iova_bitmap.h>
#include <uapi/linux/iommu.h>
#define IOMMU_READ (1 << 0)
......@@ -37,6 +38,7 @@ struct bus_type;
struct device;
struct iommu_domain;
struct iommu_domain_ops;
struct iommu_dirty_ops;
struct notifier_block;
struct iommu_sva;
struct iommu_fault_event;
......@@ -65,6 +67,9 @@ struct iommu_domain_geometry {
#define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */
#define __IOMMU_DOMAIN_NESTED (1U << 6) /* User-managed address space nested
on a stage-2 translation */
#define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
/*
* This are the possible domain-types
......@@ -91,10 +96,13 @@ struct iommu_domain_geometry {
__IOMMU_DOMAIN_DMA_API | \
__IOMMU_DOMAIN_DMA_FQ)
#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA)
#define IOMMU_DOMAIN_NESTED (__IOMMU_DOMAIN_NESTED)
struct iommu_domain {
unsigned type;
const struct iommu_domain_ops *ops;
const struct iommu_dirty_ops *dirty_ops;
unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */
struct iommu_domain_geometry geometry;
struct iommu_dma_cookie *iova_cookie;
......@@ -133,6 +141,7 @@ enum iommu_cap {
* usefully support the non-strict DMA flush queue.
*/
IOMMU_CAP_DEFERRED_FLUSH,
IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */
};
/* These are the possible reserved region types */
......@@ -227,6 +236,90 @@ struct iommu_iotlb_gather {
bool queued;
};
/**
* struct iommu_dirty_bitmap - Dirty IOVA bitmap state
* @bitmap: IOVA bitmap
* @gather: Range information for a pending IOTLB flush
*/
struct iommu_dirty_bitmap {
struct iova_bitmap *bitmap;
struct iommu_iotlb_gather *gather;
};
/* Read but do not clear any dirty bits */
#define IOMMU_DIRTY_NO_CLEAR (1 << 0)
/**
* struct iommu_dirty_ops - domain specific dirty tracking operations
* @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain
* @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled
* into a bitmap, with each bit representing a page.
* Reads the dirty PTE bits and clears them from the
* IO page tables.
*/
struct iommu_dirty_ops {
int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled);
int (*read_and_clear_dirty)(struct iommu_domain *domain,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty);
};
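For illustration only (not part of this series): a minimal sketch of how a driver might wire up these callbacks, assuming a hypothetical driver whose io-pgtable format implements the read_and_clear_dirty() op added above; my_domain, to_my_domain(), my_hw_toggle_dirty() and the pgtbl_ops field are invented names.

/* Hypothetical driver glue, for illustration only. */
static int my_iommu_set_dirty_tracking(struct iommu_domain *domain,
				       bool enabled)
{
	struct my_domain *md = to_my_domain(domain);	/* invented helper */

	/* e.g. flip a hardware Access/Dirty-enable bit for this domain */
	return my_hw_toggle_dirty(md, enabled);		/* invented helper */
}

static int my_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct my_domain *md = to_my_domain(domain);

	/* Delegate the page-table walk to the io-pgtable op shown above */
	return md->pgtbl_ops->read_and_clear_dirty(md->pgtbl_ops, iova, size,
						   flags, dirty);
}

static const struct iommu_dirty_ops my_dirty_ops = {
	.set_dirty_tracking	= my_iommu_set_dirty_tracking,
	.read_and_clear_dirty	= my_iommu_read_and_clear_dirty,
};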
/**
* struct iommu_user_data - iommu driver specific user space data info
* @type: The data type of the user buffer
* @uptr: Pointer to the user buffer for copy_from_user()
* @len: The length of the user buffer in bytes
*
* The user space data is defined by a uAPI structure in
* include/uapi/linux/iommufd.h; @type, @uptr and @len should simply be copied
* from the corresponding iommufd core uAPI structure.
*/
struct iommu_user_data {
unsigned int type;
void __user *uptr;
size_t len;
};
/**
* __iommu_copy_struct_from_user - Copy iommu driver specific user space data
* @dst_data: Pointer to an iommu driver specific user data that is defined in
* include/uapi/linux/iommufd.h
* @src_data: Pointer to a struct iommu_user_data for user space data info
* @data_type: The data type of the @dst_data. Must match @src_data->type
* @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
* @min_len: Initial length of user data structure for backward compatibility.
* This should be offsetofend using the last member in the user data
* struct that was initially added to include/uapi/linux/iommufd.h
*/
static inline int __iommu_copy_struct_from_user(
void *dst_data, const struct iommu_user_data *src_data,
unsigned int data_type, size_t data_len, size_t min_len)
{
if (WARN_ON(!dst_data || !src_data))
return -EINVAL;
if (src_data->type != data_type)
return -EINVAL;
if (src_data->len < min_len || data_len < src_data->len)
return -EINVAL;
return copy_struct_from_user(dst_data, data_len, src_data->uptr,
src_data->len);
}
/**
* iommu_copy_struct_from_user - Copy iommu driver specific user space data
* @kdst: Pointer to an iommu driver specific user data that is defined in
* include/uapi/linux/iommufd.h
* @user_data: Pointer to a struct iommu_user_data for user space data info
* @data_type: The data type of the @kdst. Must match with @user_data->type
* @min_last: The last member of the data structure @kdst points to in the
* initial version.
* Return 0 for success, otherwise -error.
*/
#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \
__iommu_copy_struct_from_user(kdst, user_data, data_type, \
sizeof(*kdst), \
offsetofend(typeof(*kdst), min_last))
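A hedged usage sketch of the macro above, inside a driver's allocation path: copying the struct iommu_hwpt_vtd_s1 defined later in this series, with __reserved as the last member of its initial layout. The surrounding function and parameter names are invented for the example.

/* Illustrative only: copying VT-d stage-1 data in a domain_alloc_user path */
static int my_parse_s1_data(const struct iommu_user_data *user_data,
			    struct iommu_hwpt_vtd_s1 *vtd)
{
	/* Rejects a mismatched type and copies at most sizeof(*vtd),
	 * zero-filling any tail an older, shorter user struct left out. */
	return iommu_copy_struct_from_user(vtd, user_data,
					   IOMMU_HWPT_DATA_VTD_S1,
					   __reserved);
}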
/**
* struct iommu_ops - iommu ops and capabilities
* @capable: check capability
......@@ -234,7 +327,19 @@ struct iommu_iotlb_gather {
* op is allocated in the iommu driver and freed by the caller after
* use. The information type is one of enum iommu_hw_info_type defined
* in include/uapi/linux/iommufd.h.
* @domain_alloc: allocate iommu domain
* @domain_alloc: Allocate and return an iommu domain on success, or NULL on
* failure. The domain is not fully initialized until
* iommu_domain_alloc() returns to the caller.
* @domain_alloc_user: Allocate an iommu domain corresponding to the input
* parameters as defined in include/uapi/linux/iommufd.h.
* Unlike @domain_alloc, it is called only by IOMMUFD and
* must fully initialize the new domain before return.
* Upon success, if the @user_data is valid and the @parent
* points to a kernel-managed domain, the new domain must be
* IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
* NULL, the @user_data is optional, and the new domain must
* support __IOMMU_DOMAIN_PAGING.
* Upon failure, ERR_PTR must be returned.
* @probe_device: Add device to iommu driver handling
* @release_device: Remove device from iommu driver handling
* @probe_finalize: Do final setup work after the device is added to an IOMMU
......@@ -267,6 +372,9 @@ struct iommu_ops {
/* Domain allocation and freeing by the iommu driver */
struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
struct iommu_domain *(*domain_alloc_user)(
struct device *dev, u32 flags, struct iommu_domain *parent,
const struct iommu_user_data *user_data);
struct iommu_device *(*probe_device)(struct device *dev);
void (*release_device)(struct device *dev);
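The contract described in the @domain_alloc_user kernel-doc can be read as the branching below. This is an editorial sketch of a hypothetical driver, not code from this series; my_alloc_nested()/my_alloc_paging() are invented, and the IOMMU_DOMAIN_UNMANAGED check simply stands in for "kernel-managed paging domain".

/* Illustrative skeleton only; error handling trimmed. */
static struct iommu_domain *
my_domain_alloc_user(struct device *dev, u32 flags,
		     struct iommu_domain *parent,
		     const struct iommu_user_data *user_data)
{
	if (parent) {
		/* Nesting: parent must be a kernel-managed paging domain
		 * and user_data must carry the stage-1 description. */
		if (parent->type != IOMMU_DOMAIN_UNMANAGED || !user_data)
			return ERR_PTR(-EINVAL);
		return my_alloc_nested(dev, parent, user_data);	/* invented */
	}

	/* No parent: allocate a fully initialized paging domain;
	 * user_data is optional here. */
	if (flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT |
		      IOMMU_HWPT_ALLOC_DIRTY_TRACKING))
		return ERR_PTR(-EOPNOTSUPP);
	return my_alloc_paging(dev, flags, user_data);		/* invented */
}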
......@@ -632,6 +740,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
return gather && gather->queued;
}
static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
struct iova_bitmap *bitmap,
struct iommu_iotlb_gather *gather)
{
if (gather)
iommu_iotlb_gather_init(gather);
dirty->bitmap = bitmap;
dirty->gather = gather;
}
static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
unsigned long iova,
unsigned long length)
{
if (dirty->bitmap)
iova_bitmap_set(dirty->bitmap, iova, length);
if (dirty->gather)
iommu_iotlb_gather_add_range(dirty->gather, iova, length);
}
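A sketch of the caller side of these two helpers, assuming a caller similar to iommufd that already holds an iova_bitmap and a domain carrying dirty_ops; my_collect_dirty() and the overall flow are illustrative, not taken from this series.

/* Illustrative caller flow. */
static int my_collect_dirty(struct iommu_domain *domain,
			    struct iova_bitmap *bitmap,
			    unsigned long iova, size_t size,
			    unsigned long flags)
{
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	int ret;

	iommu_dirty_bitmap_init(&dirty, bitmap, &gather);

	/* The driver records dirtied ranges via iommu_dirty_bitmap_record() */
	ret = domain->dirty_ops->read_and_clear_dirty(domain, iova, size,
						      flags, &dirty);

	/* Cleared dirty bits require an IOTLB flush unless NO_CLEAR was set */
	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);
	return ret;
}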
/* PCI device grouping function */
extern struct iommu_group *pci_device_group(struct device *dev);
/* Generic device grouping function */
......@@ -737,6 +867,8 @@ struct iommu_fwspec {};
struct iommu_device {};
struct iommu_fault_param {};
struct iommu_iotlb_gather {};
struct iommu_dirty_bitmap {};
struct iommu_dirty_ops {};
static inline bool iommu_present(const struct bus_type *bus)
{
......@@ -969,6 +1101,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
return false;
}
static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
struct iova_bitmap *bitmap,
struct iommu_iotlb_gather *gather)
{
}
static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
unsigned long iova,
unsigned long length)
{
}
static inline void iommu_device_unregister(struct iommu_device *iommu)
{
}
......
......@@ -7,6 +7,7 @@
#define _IOVA_BITMAP_H_
#include <linux/types.h>
#include <linux/errno.h>
struct iova_bitmap;
......@@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap,
unsigned long iova, size_t length,
void *opaque);
#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER)
struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
unsigned long page_size,
u64 __user *data);
......@@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
iova_bitmap_fn_t fn);
void iova_bitmap_set(struct iova_bitmap *bitmap,
unsigned long iova, size_t length);
#else
static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova,
size_t length,
unsigned long page_size,
u64 __user *data)
{
return NULL;
}
static inline void iova_bitmap_free(struct iova_bitmap *bitmap)
{
}
static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
iova_bitmap_fn_t fn)
{
return -EOPNOTSUPP;
}
static inline void iova_bitmap_set(struct iova_bitmap *bitmap,
unsigned long iova, size_t length)
{
}
#endif
#endif
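A hedged sketch of the in-kernel usage pattern for the iova_bitmap API above: allocate a bitmap mirroring a user buffer, iterate it in chunks through a callback that sets dirty bits, then free it. The callback body and the my_* names are invented.

/* Illustrative only. */
static int my_fetch_dirty(struct iova_bitmap *bitmap, unsigned long iova,
			  size_t length, void *opaque)
{
	struct iommu_domain *domain = opaque;

	/* Typically ends up in a read_and_clear_dirty() walk, which calls
	 * iova_bitmap_set() for each dirtied range it finds. */
	return my_read_and_clear(domain, iova, length, bitmap);	/* invented */
}

static int my_dirty_query(struct iommu_domain *domain, unsigned long iova,
			  size_t length, unsigned long page_size,
			  u64 __user *data)
{
	struct iova_bitmap *bitmap;
	int ret;

	bitmap = iova_bitmap_alloc(iova, length, page_size, data);
	if (IS_ERR(bitmap))
		return PTR_ERR(bitmap);

	ret = iova_bitmap_for_each(bitmap, domain, my_fetch_dirty);

	iova_bitmap_free(bitmap);
	return ret;
}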
......@@ -47,6 +47,8 @@ enum {
IOMMUFD_CMD_VFIO_IOAS,
IOMMUFD_CMD_HWPT_ALLOC,
IOMMUFD_CMD_GET_HW_INFO,
IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
};
/**
......@@ -347,20 +349,86 @@ struct iommu_vfio_ioas {
};
#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
/**
* enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation
* @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as
* the parent HWPT in a nesting configuration.
* @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is
* enforced on device attachment
*/
enum iommufd_hwpt_alloc_flags {
IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0,
IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1,
};
/**
* enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table
* entry attributes
* @IOMMU_VTD_S1_SRE: Supervisor request
* @IOMMU_VTD_S1_EAFE: Extended access enable
* @IOMMU_VTD_S1_WPE: Write protect enable
*/
enum iommu_hwpt_vtd_s1_flags {
IOMMU_VTD_S1_SRE = 1 << 0,
IOMMU_VTD_S1_EAFE = 1 << 1,
IOMMU_VTD_S1_WPE = 1 << 2,
};
/**
* struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table
* info (IOMMU_HWPT_DATA_VTD_S1)
* @flags: Combination of enum iommu_hwpt_vtd_s1_flags
* @pgtbl_addr: The base address of the stage-1 page table.
* @addr_width: The address width of the stage-1 page table
* @__reserved: Must be 0
*/
struct iommu_hwpt_vtd_s1 {
__aligned_u64 flags;
__aligned_u64 pgtbl_addr;
__u32 addr_width;
__u32 __reserved;
};
/**
* enum iommu_hwpt_data_type - IOMMU HWPT Data Type
* @IOMMU_HWPT_DATA_NONE: no data
* @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
*/
enum iommu_hwpt_data_type {
IOMMU_HWPT_DATA_NONE,
IOMMU_HWPT_DATA_VTD_S1,
};
/**
* struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC)
* @size: sizeof(struct iommu_hwpt_alloc)
* @flags: Must be 0
* @flags: Combination of enum iommufd_hwpt_alloc_flags
* @dev_id: The device to allocate this HWPT for
* @pt_id: The IOAS to connect this HWPT to
* @pt_id: The IOAS or HWPT to connect this HWPT to
* @out_hwpt_id: The ID of the new HWPT
* @__reserved: Must be 0
* @data_type: One of enum iommu_hwpt_data_type
* @data_len: Length of the type specific data
* @data_uptr: User pointer to the type specific data
*
* Explicitly allocate a hardware page table object. This is the same object
* type that is returned by iommufd_device_attach() and represents the
* underlying iommu driver's iommu_domain kernel object.
*
* A HWPT will be created with the IOVA mappings from the given IOAS.
* A kernel-managed HWPT will be created with the mappings from the given
* IOAS via the @pt_id. The @data_type for this allocation must be set to
* IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
* nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
*
* A user-managed nested HWPT will be created from a given parent HWPT via
* @pt_id, where the parent HWPT must have been allocated previously via the
* same ioctl from a given IOAS (@pt_id). In this case, the @data_type
* must be set to a pre-defined type corresponding to an I/O page table
* type supported by the underlying IOMMU hardware.
*
* If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
* @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
* must be given.
*/
struct iommu_hwpt_alloc {
__u32 size;
......@@ -369,13 +437,26 @@ struct iommu_hwpt_alloc {
__u32 pt_id;
__u32 out_hwpt_id;
__u32 __reserved;
__u32 data_type;
__u32 data_len;
__aligned_u64 data_uptr;
};
#define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC)
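Putting the two allocation modes together, a hedged userspace sketch; it assumes the usual <sys/ioctl.h>, <stdint.h> and <linux/iommufd.h> includes, that iommufd, dev_id and ioas_id were obtained during earlier IOAS/device setup, and the 48-bit address width and guest table address are placeholders.

/* Illustrative userspace flow, not a definitive implementation. */
static int alloc_nested_hwpt(int iommufd, __u32 dev_id, __u32 ioas_id,
			     __u64 guest_s1_pgtbl, __u32 *out_nested_id)
{
	struct iommu_hwpt_vtd_s1 vtd = {
		.pgtbl_addr = guest_s1_pgtbl,	/* guest stage-1 table address */
		.addr_width = 48,		/* assumption: 4-level table */
	};
	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
		.dev_id = dev_id,
		.pt_id = ioas_id,		/* kernel-managed parent from IOAS */
		.data_type = IOMMU_HWPT_DATA_NONE,
	};

	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;

	/* Now nest a user-managed VT-d stage-1 HWPT on top of the parent */
	cmd.flags = 0;
	cmd.pt_id = cmd.out_hwpt_id;
	cmd.data_type = IOMMU_HWPT_DATA_VTD_S1;
	cmd.data_len = sizeof(vtd);
	cmd.data_uptr = (__u64)(uintptr_t)&vtd;
	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;

	*out_nested_id = cmd.out_hwpt_id;
	return 0;
}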
/**
* enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info
* @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings
* on a nested_parent domain.
* https://www.intel.com/content/www/us/en/content-details/772415/content-details.html
*/
enum iommu_hw_info_vtd_flags {
IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0,
};
/**
* struct iommu_hw_info_vtd - Intel VT-d hardware information
*
* @flags: Must be 0 (previous layout)
* @flags: Combination of enum iommu_hw_info_vtd_flags
* @__reserved: Must be 0
*
* @cap_reg: Value of Intel VT-d capability register defined in VT-d spec
......@@ -404,6 +485,20 @@ enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_INTEL_VTD,
};
/**
* enum iommufd_hw_capabilities
* @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking.
* If available, the following APIs are
* supported:
*
* IOMMU_HWPT_GET_DIRTY_BITMAP
* IOMMU_HWPT_SET_DIRTY_TRACKING
*
*/
enum iommufd_hw_capabilities {
IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0,
};
/**
* struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO)
* @size: sizeof(struct iommu_hw_info)
......@@ -415,6 +510,8 @@ enum iommu_hw_info_type {
* the iommu type specific hardware information data
* @out_data_type: Output the iommu hardware info type as defined in the enum
* iommu_hw_info_type.
* @out_capabilities: Output the generic iommu capability info type as defined
* in the enum iommu_hw_capabilities.
* @__reserved: Must be 0
*
* Query an iommu type specific hardware information data from an iommu behind
......@@ -439,6 +536,81 @@ struct iommu_hw_info {
__aligned_u64 data_uptr;
__u32 out_data_type;
__u32 __reserved;
__aligned_u64 out_capabilities;
};
#define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)
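A VMM would typically probe @out_capabilities before deciding to rely on IOMMU-side dirty tracking. A hedged userspace sketch (dev_id from earlier device binding; only the fields needed here are initialized, leaving data_len/data_uptr at zero so no driver-specific data is copied):

/* Illustrative probe for dirty tracking support. */
static bool supports_dirty_tracking(int iommufd, __u32 dev_id)
{
	struct iommu_hw_info info = {
		.size = sizeof(info),
		.dev_id = dev_id,
	};

	if (ioctl(iommufd, IOMMU_GET_HW_INFO, &info))
		return false;
	return info.out_capabilities & IOMMU_HW_CAP_DIRTY_TRACKING;
}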
/**
* enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty
* tracking
* @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking
*/
enum iommufd_hwpt_set_dirty_tracking_flags {
IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1,
};
/**
* struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING)
* @size: sizeof(struct iommu_hwpt_set_dirty_tracking)
* @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags
* @hwpt_id: HW pagetable ID that represents the IOMMU domain
* @__reserved: Must be 0
*
* Toggle dirty tracking on an HW pagetable.
*/
struct iommu_hwpt_set_dirty_tracking {
__u32 size;
__u32 flags;
__u32 hwpt_id;
__u32 __reserved;
};
#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \
IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING)
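A short userspace sketch of toggling the tracker, e.g. when migration pre-copy starts or stops; the wrapper name is invented and the usual ioctl/iommufd headers are assumed.

/* Illustrative: enable or disable dirty tracking on a HWPT. */
static int hwpt_set_dirty_tracking(int iommufd, __u32 hwpt_id, bool enable)
{
	struct iommu_hwpt_set_dirty_tracking cmd = {
		.size = sizeof(cmd),
		.flags = enable ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0,
		.hwpt_id = hwpt_id,
	};

	return ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd);
}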
/**
* enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits
* @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing
* any dirty bit metadata. This flag can
* be passed when the next operation on
* the same IOVA range is expected to be
* an unmap.
*
*/
enum iommufd_hwpt_get_dirty_bitmap_flags {
IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1,
};
/**
* struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
* @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
* @hwpt_id: HW pagetable ID that represents the IOMMU domain
* @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags
* @__reserved: Must be 0
* @iova: base IOVA of the bitmap first bit
* @length: IOVA range size
* @page_size: page size granularity of each bit in the bitmap
* @data: bitmap where to set the dirty bits. Each bit in the bitmap
* represents one page_size granule of IOVA.
*
* Checking whether a given IOVA is dirty:
*
* data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
*
* Walk the IOMMU pagetables for a given IOVA range to return a bitmap
* with the dirty IOVAs. In doing so it will also by default clear any
* dirty bit metadata set in the IOPTE.
*/
struct iommu_hwpt_get_dirty_bitmap {
__u32 size;
__u32 hwpt_id;
__u32 flags;
__u32 __reserved;
__aligned_u64 iova;
__aligned_u64 length;
__aligned_u64 page_size;
__aligned_u64 data;
};
#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
#endif
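A hedged userspace sketch of fetching and testing the bitmap; the caller is assumed to have allocated at least (length / page_size) bits worth of __u64 words, and the bit test follows the formula documented in the kernel-doc above.

/* Illustrative: read the dirty bitmap for one IOVA range. */
static int hwpt_get_dirty_bitmap(int iommufd, __u32 hwpt_id, __u64 iova,
				 __u64 length, __u64 page_size, __u64 *bitmap)
{
	struct iommu_hwpt_get_dirty_bitmap cmd = {
		.size = sizeof(cmd),
		.hwpt_id = hwpt_id,
		.iova = iova,
		.length = length,
		.page_size = page_size,
		.data = (__u64)(uintptr_t)bitmap,
	};

	return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd);
}

/* data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) */
static bool iova_is_dirty(const __u64 *bitmap, __u64 iova, __u64 page_size)
{
	__u64 bit = iova / page_size;

	return bitmap[bit / 64] & (1ULL << (bit % 64));
}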
......@@ -105,7 +105,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata,
/*
* This is just an arbitrary limit based on the current kernel
* situation. Changes in the kernel can dramtically change the number of
* situation. Changes in the kernel can dramatically change the number of
* required fault injection sites, so if this hits it doesn't
* necessarily mean a test failure, just that the limit has to be made
* bigger.
......@@ -612,10 +612,11 @@ TEST_FAIL_NTH(basic_fail_nth, device)
&idev_id))
return -1;
if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info)))
if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL))
return -1;
if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id))
if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id,
IOMMU_HWPT_DATA_NONE, 0, 0))
return -1;
if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL))
......