Commit f35f22cc authored by Joao Martins's avatar Joao Martins Committed by Jason Gunthorpe

iommu/vt-d: Access/Dirty bit support for SS domains

IOMMU advertises Access/Dirty bits for second-stage page table if the
extended capability DMAR register reports it (ECAP, mnemonic ECAP.SSADS).
The first stage table is compatible with CPU page table thus A/D bits are
implicitly supported. Relevant Intel IOMMU SDM ref for first stage table
"3.6.2 Accessed, Extended Accessed, and Dirty Flags" and second stage table
"3.7.2 Accessed and Dirty Flags".

First stage page table is enabled by default so it's allowed to set dirty
tracking and no control bits needed, it just returns 0. To use SSADS, set
bit 9 (SSADE) in the scalable-mode PASID table entry and flush the IOTLB
via pasid_flush_caches() following the manual. Relevant SDM refs:

"3.7.2 Accessed and Dirty Flags"
"6.5.3.3 Guidance to Software for Invalidations,
 Table 23. Guidance to Software for Invalidations"

PTE dirty bit is located in bit 9 and it's cached in the IOTLB so flush
IOTLB to make sure IOMMU attempts to set the dirty bit again. Note that
iommu_dirty_bitmap_record() will add the IOVA to iotlb_gather and thus the
caller of the iommu op will flush the IOTLB. Relevant manuals over the
hardware translation is chapter 6 with some special mention to:

"6.2.3.1 Scalable-Mode PASID-Table Entry Programming Considerations"
"6.2.4 IOTLB"

Select IOMMUFD_DRIVER only if IOMMUFD is enabled, given that IOMMU dirty
tracking requires IOMMUFD.

Link: https://lore.kernel.org/r/20231024135109.73787-13-joao.m.martins@oracle.comSigned-off-by: default avatarJoao Martins <joao.m.martins@oracle.com>
Reviewed-by: default avatarLu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: default avatarKevin Tian <kevin.tian@intel.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@nvidia.com>
parent 421a511a
...@@ -15,6 +15,7 @@ config INTEL_IOMMU ...@@ -15,6 +15,7 @@ config INTEL_IOMMU
select DMA_OPS select DMA_OPS
select IOMMU_API select IOMMU_API
select IOMMU_IOVA select IOMMU_IOVA
select IOMMUFD_DRIVER if IOMMUFD
select NEED_DMA_MAP_STATE select NEED_DMA_MAP_STATE
select DMAR_TABLE select DMAR_TABLE
select SWIOTLB select SWIOTLB
......
...@@ -300,6 +300,7 @@ static int iommu_skip_te_disable; ...@@ -300,6 +300,7 @@ static int iommu_skip_te_disable;
#define IDENTMAP_AZALIA 4 #define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops; const struct iommu_ops intel_iommu_ops;
const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu) static bool translation_pre_enabled(struct intel_iommu *iommu)
{ {
...@@ -4079,8 +4080,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) ...@@ -4079,8 +4080,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
{ {
struct iommu_domain *domain; struct iommu_domain *domain;
struct intel_iommu *iommu; struct intel_iommu *iommu;
bool dirty_tracking;
if (flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) if (flags &
(~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
return ERR_PTR(-EOPNOTSUPP); return ERR_PTR(-EOPNOTSUPP);
iommu = device_to_iommu(dev, NULL, NULL); iommu = device_to_iommu(dev, NULL, NULL);
...@@ -4090,6 +4093,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) ...@@ -4090,6 +4093,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap)) if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP); return ERR_PTR(-EOPNOTSUPP);
dirty_tracking = (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING);
if (dirty_tracking && !ssads_supported(iommu))
return ERR_PTR(-EOPNOTSUPP);
/* /*
* domain_alloc_user op needs to fully initialize a domain * domain_alloc_user op needs to fully initialize a domain
* before return, so uses iommu_domain_alloc() here for * before return, so uses iommu_domain_alloc() here for
...@@ -4098,6 +4105,15 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags) ...@@ -4098,6 +4105,15 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
domain = iommu_domain_alloc(dev->bus); domain = iommu_domain_alloc(dev->bus);
if (!domain) if (!domain)
domain = ERR_PTR(-ENOMEM); domain = ERR_PTR(-ENOMEM);
if (!IS_ERR(domain) && dirty_tracking) {
if (to_dmar_domain(domain)->use_first_level) {
iommu_domain_free(domain);
return ERR_PTR(-EOPNOTSUPP);
}
domain->dirty_ops = &intel_dirty_ops;
}
return domain; return domain;
} }
...@@ -4121,6 +4137,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, ...@@ -4121,6 +4137,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
return -EINVAL; return -EINVAL;
if (domain->dirty_ops && !ssads_supported(iommu))
return -EINVAL;
/* check if this iommu agaw is sufficient for max mapped address */ /* check if this iommu agaw is sufficient for max mapped address */
addr_width = agaw_to_width(iommu->agaw); addr_width = agaw_to_width(iommu->agaw);
if (addr_width > cap_mgaw(iommu->cap)) if (addr_width > cap_mgaw(iommu->cap))
...@@ -4375,6 +4394,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) ...@@ -4375,6 +4394,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
return dmar_platform_optin(); return dmar_platform_optin();
case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
return ecap_sc_support(info->iommu->ecap); return ecap_sc_support(info->iommu->ecap);
case IOMMU_CAP_DIRTY_TRACKING:
return ssads_supported(info->iommu);
default: default:
return false; return false;
} }
...@@ -4772,6 +4793,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, ...@@ -4772,6 +4793,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
return -EOPNOTSUPP; return -EOPNOTSUPP;
if (domain->dirty_ops)
return -EINVAL;
if (context_copied(iommu, info->bus, info->devfn)) if (context_copied(iommu, info->bus, info->devfn))
return -EBUSY; return -EBUSY;
...@@ -4830,6 +4854,83 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) ...@@ -4830,6 +4854,83 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
return vtd; return vtd;
} }
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct device_domain_info *info;
int ret;
spin_lock(&dmar_domain->lock);
if (dmar_domain->dirty_tracking == enable)
goto out_unlock;
list_for_each_entry(info, &dmar_domain->devices, link) {
ret = intel_pasid_setup_dirty_tracking(info->iommu,
info->domain, info->dev,
IOMMU_NO_PASID, enable);
if (ret)
goto err_unwind;
}
dmar_domain->dirty_tracking = enable;
out_unlock:
spin_unlock(&dmar_domain->lock);
return 0;
err_unwind:
list_for_each_entry(info, &dmar_domain->devices, link)
intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
info->dev, IOMMU_NO_PASID,
dmar_domain->dirty_tracking);
spin_unlock(&dmar_domain->lock);
return ret;
}
static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
unsigned long end = iova + size - 1;
unsigned long pgsize;
/*
* IOMMUFD core calls into a dirty tracking disabled domain without an
* IOVA bitmap set in order to clean dirty bits in all PTEs that might
* have occurred when we stopped dirty tracking. This ensures that we
* never inherit dirtied bits from a previous cycle.
*/
if (!dmar_domain->dirty_tracking && dirty->bitmap)
return -EINVAL;
do {
struct dma_pte *pte;
int lvl = 0;
pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
GFP_ATOMIC);
pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
if (!pte || !dma_pte_present(pte)) {
iova += pgsize;
continue;
}
if (dma_sl_pte_test_and_clear_dirty(pte, flags))
iommu_dirty_bitmap_record(dirty, iova, pgsize);
iova += pgsize;
} while (iova < end);
return 0;
}
const struct iommu_dirty_ops intel_dirty_ops = {
.set_dirty_tracking = intel_iommu_set_dirty_tracking,
.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
const struct iommu_ops intel_iommu_ops = { const struct iommu_ops intel_iommu_ops = {
.capable = intel_iommu_capable, .capable = intel_iommu_capable,
.hw_info = intel_iommu_hw_info, .hw_info = intel_iommu_hw_info,
......
...@@ -48,6 +48,9 @@ ...@@ -48,6 +48,9 @@
#define DMA_FL_PTE_DIRTY BIT_ULL(6) #define DMA_FL_PTE_DIRTY BIT_ULL(6)
#define DMA_FL_PTE_XD BIT_ULL(63) #define DMA_FL_PTE_XD BIT_ULL(63)
#define DMA_SL_PTE_DIRTY_BIT 9
#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
#define ADDR_WIDTH_5LEVEL (57) #define ADDR_WIDTH_5LEVEL (57)
#define ADDR_WIDTH_4LEVEL (48) #define ADDR_WIDTH_4LEVEL (48)
...@@ -539,6 +542,8 @@ enum { ...@@ -539,6 +542,8 @@ enum {
#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu) (sm_supported(iommu) && \ #define pasid_supported(iommu) (sm_supported(iommu) && \
ecap_pasid((iommu)->ecap)) ecap_pasid((iommu)->ecap))
#define ssads_supported(iommu) (sm_supported(iommu) && \
ecap_slads((iommu)->ecap))
struct pasid_entry; struct pasid_entry;
struct pasid_state_entry; struct pasid_state_entry;
...@@ -592,6 +597,7 @@ struct dmar_domain { ...@@ -592,6 +597,7 @@ struct dmar_domain {
* otherwise, goes through the second * otherwise, goes through the second
* level. * level.
*/ */
u8 dirty_tracking:1; /* Dirty tracking is enabled */
spinlock_t lock; /* Protect device tracking lists */ spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */ struct list_head devices; /* all devices' list */
...@@ -781,6 +787,16 @@ static inline bool dma_pte_present(struct dma_pte *pte) ...@@ -781,6 +787,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0; return (pte->val & 3) != 0;
} }
static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
unsigned long flags)
{
if (flags & IOMMU_DIRTY_NO_CLEAR)
return (pte->val & DMA_SL_PTE_DIRTY) != 0;
return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
(unsigned long *)&pte->val);
}
static inline bool dma_pte_superpage(struct dma_pte *pte) static inline bool dma_pte_superpage(struct dma_pte *pte)
{ {
return (pte->val & DMA_PTE_LARGE_PAGE); return (pte->val & DMA_PTE_LARGE_PAGE);
......
...@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) ...@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
WRITE_ONCE(*ptr, (old & ~mask) | bits); WRITE_ONCE(*ptr, (old & ~mask) | bits);
} }
static inline u64 pasid_get_bits(u64 *ptr)
{
return READ_ONCE(*ptr);
}
/* /*
* Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
* PASID entry. * PASID entry.
...@@ -335,6 +340,36 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe) ...@@ -335,6 +340,36 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
pasid_set_bits(&pe->val[0], 1 << 1, 0); pasid_set_bits(&pe->val[0], 1 << 1, 0);
} }
/*
* Enable second level A/D bits by setting the SLADE (Second Level
* Access Dirty Enable) field (Bit 9) of a scalable mode PASID
* entry.
*/
static inline void pasid_set_ssade(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
}
/*
* Disable second level A/D bits by clearing the SLADE (Second Level
* Access Dirty Enable) field (Bit 9) of a scalable mode PASID
* entry.
*/
static inline void pasid_clear_ssade(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[0], 1 << 9, 0);
}
/*
* Checks if second level A/D bits specifically the SLADE (Second Level
* Access Dirty Enable) field (Bit 9) of a scalable mode PASID
* entry is set.
*/
static inline bool pasid_get_ssade(struct pasid_entry *pe)
{
return pasid_get_bits(&pe->val[0]) & (1 << 9);
}
/* /*
* Setup the WPE(Write Protect Enable) field (Bit 132) of a * Setup the WPE(Write Protect Enable) field (Bit 132) of a
* scalable mode PASID entry. * scalable mode PASID entry.
...@@ -627,6 +662,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, ...@@ -627,6 +662,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte); pasid_set_fault_enable(pte);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
if (domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_present(pte); pasid_set_present(pte);
spin_unlock(&iommu->lock); spin_unlock(&iommu->lock);
...@@ -636,6 +673,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, ...@@ -636,6 +673,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return 0; return 0;
} }
/*
* Set up dirty tracking on a second only or nested translation type.
*/
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid,
bool enabled)
{
struct pasid_entry *pte;
u16 did, pgtt;
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
dev_err_ratelimited(
dev, "Failed to get pasid entry of PASID %d\n", pasid);
return -ENODEV;
}
did = domain_id_iommu(domain, iommu);
pgtt = pasid_pte_get_pgtt(pte);
if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
pgtt != PASID_ENTRY_PGTT_NESTED) {
spin_unlock(&iommu->lock);
dev_err_ratelimited(
dev,
"Dirty tracking not supported on translation type %d\n",
pgtt);
return -EOPNOTSUPP;
}
if (pasid_get_ssade(pte) == enabled) {
spin_unlock(&iommu->lock);
return 0;
}
if (enabled)
pasid_set_ssade(pte);
else
pasid_clear_ssade(pte);
spin_unlock(&iommu->lock);
if (!ecap_coherent(iommu->ecap))
clflush_cache_range(pte, sizeof(*pte));
/*
* From VT-d spec table 25 "Guidance to Software for Invalidations":
*
* - PASID-selective-within-Domain PASID-cache invalidation
* If (PGTT=SS or Nested)
* - Domain-selective IOTLB invalidation
* Else
* - PASID-selective PASID-based IOTLB invalidation
* - If (pasid is RID_PASID)
* - Global Device-TLB invalidation to affected functions
* Else
* - PASID-based Device-TLB invalidation (with S=1 and
* Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
*/
pasid_cache_invalidation_with_pasid(iommu, did, pasid);
iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
/* Device IOTLB doesn't need to be flushed in caching mode. */
if (!cap_caching_mode(iommu->cap))
devtlb_invalidation_with_pasid(iommu, dev, pasid);
return 0;
}
/* /*
* Set up the scalable mode pasid entry for passthrough translation type. * Set up the scalable mode pasid entry for passthrough translation type.
*/ */
......
...@@ -106,6 +106,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, ...@@ -106,6 +106,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
int intel_pasid_setup_second_level(struct intel_iommu *iommu, int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain, struct dmar_domain *domain,
struct device *dev, u32 pasid); struct device *dev, u32 pasid);
int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid,
bool enabled);
int intel_pasid_setup_pass_through(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct dmar_domain *domain, struct dmar_domain *domain,
struct device *dev, u32 pasid); struct device *dev, u32 pasid);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment