Commit 201ed7f3 authored by Alexey Kardashevskiy, committed by Michael Ellerman

powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU pages

At the moment we only create a small DMA window for 32bit devices; it maps only the 0..2GB range of PCI space. For other devices we use either a sketchy bypass or the hardware bypass, but the former only works if the amount of RAM is no bigger than the device's DMA mask, and the latter requires devices to support at least 59bit DMA.

This extends the default DMA window to the maximum size possible, to allow a wider DMA mask than just 32bit. The default window size is now limited by the iommu_table::it_map allocation bitmap, which is a contiguous array with one bit per IOMMU page.
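
As a rough illustration (not part of this patch), assuming a 64K system page size (PAGE_SHIFT = 16) and MAX_ORDER = 9, the usual powerpc configuration for 64K pages, a single max-order allocation for it_map bounds the window roughly like this:

/* Sketch only: how a single max-order it_map allocation bounds the window.
 * PAGE_SHIFT = 16 and MAX_ORDER = 9 are assumed example values.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 16;	/* 64K system/IOMMU pages */
	const unsigned int max_order = 9;	/* assumed MAX_ORDER */

	/* Largest contiguous allocation available for it_map, in bytes */
	const unsigned long long maxblock = 1ULL << (page_shift + max_order - 1);

	/* One bit per IOMMU page, so it_map can track maxblock * 8 pages */
	const unsigned long long window_size = (maxblock * 8) << page_shift;

	printf("maxblock    = 0x%llx bytes\n", maxblock);	/* 16MB */
	printf("window_size = 0x%llx bytes\n", window_size);	/* 1 << 43 */
	return 0;
}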

This increases the default IOMMU page size from the hardcoded 4K to the system page size, to allow wider DMA masks.

This increases the number of TCE levels so that no single level exceeds the maximum order allocation limit. At the same time, it keeps the minimum number of levels at 2 in order to save memory.
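
A minimal sketch of that level calculation, using the same assumed values (1 << 43 window, 64K pages, 16MB maxblock, default of 2 levels); the variable names mirror the patch but this standalone program is illustrative only:

/* Sketch only: pick enough TCE levels so no level exceeds maxblock,
 * but never fewer than the assumed default of 2.
 */
#include <stdio.h>

static unsigned int ilog2_ull(unsigned long long v)
{
	return 63 - __builtin_clzll(v);
}

int main(void)
{
	const unsigned int page_shift = 16;			/* assumed */
	const unsigned long long maxblock = 1ULL << 24;		/* assumed 16MB */
	const unsigned long long window_size = 1ULL << 43;
	const unsigned int default_levels = 2;			/* assumed */

	unsigned long tces_order = ilog2_ull(window_size >> page_shift);	/* 27 */
	unsigned long tcelevel_order = ilog2_ull(maxblock >> 3);		/* 21 */
	unsigned int levels = tces_order / tcelevel_order;

	if (tces_order % tcelevel_order)
		levels += 1;
	if (levels < default_levels)
		levels = default_levels;

	printf("levels = %u\n", levels);	/* 2 */
	return 0;
}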

As the extended window now overlaps the 32bit MMIO region, this adds
an area reservation to iommu_init_table().
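
For example, assuming a 32bit MMIO region starting at 2GB (m32_pci_base = 0x80000000, an example value) and 64K IOMMU pages, the reserved range handed to iommu_init_table() by the caller (see the pnv_pci_ioda2_setup_default_config hunk below) would work out as:

/* Sketch only: translate the 32bit MMIO hole into reserved IOMMU pages.
 * m32_pci_base = 0x80000000 (2GB) is an assumed example value.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int it_page_shift = 16;			/* 64K IOMMU pages */
	const unsigned long long m32_pci_base = 0x80000000ULL;	/* assumed */
	const unsigned long long window_size = 1ULL << 43;
	const unsigned long long sz_4g = 1ULL << 32;

	unsigned long long res_start = 0, res_end = 0;

	if (window_size > m32_pci_base) {
		res_start = m32_pci_base >> it_page_shift;
		res_end = (window_size < sz_4g ? window_size : sz_4g) >> it_page_shift;
	}

	/* Pages 0x8000..0x10000 of the window are excluded from DMA */
	printf("res_start = 0x%llx, res_end = 0x%llx\n", res_start, res_end);
	return 0;
}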

After this change the default window size is 0x80000000000 == 1 << 43, so devices whose DMA mask cannot cover all of system RAM can still use more than just 2GB of memory for DMA.

This is an optimization and not a bug fix for DMA API usage.

With the on-demand allocation of indirect TCE table levels enabled and 2 levels, the first TCE level size is just
1 << ceil((log2(0x7ffffffffff + 1) - 16) / 2) = 16384 TCEs, or 2 system pages.
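
A quick check of that arithmetic (illustrative only): with a 1 << 43 window, 64K pages and 2 levels, the index space of 2^27 TCEs is split so the first level holds 1 << ceil(27/2) = 16384 entries, i.e. 128KB or two 64K pages:

/* Sketch only: size of the first (top) TCE level for a 2-level table.
 * The 1 << 43 window and 64K pages come from the commit message; the
 * 8-byte TCE entry size is an assumption.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 16;
	const unsigned int levels = 2;
	const unsigned long long window_size = 0x7ffffffffffULL + 1;	/* 1 << 43 */

	unsigned int tces_order = 63 - __builtin_clzll(window_size >> page_shift); /* 27 */
	/* ceil(tces_order / levels) index bits handled by the first level */
	unsigned int first_level_order = (tces_order + levels - 1) / levels;	/* 14 */
	unsigned long long first_level_tces = 1ULL << first_level_order;	/* 16384 */
	unsigned long long first_level_bytes = first_level_tces * 8;		/* assumed 8B TCEs */

	printf("first level: %llu TCEs, %llu bytes (%llu system pages)\n",
	       first_level_tces, first_level_bytes,
	       first_level_bytes >> page_shift);	/* 16384 TCEs, 2 pages */
	return 0;
}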
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190718051139.74787-5-aik@ozlabs.ru
parent c37c792d
@@ -111,6 +111,8 @@ struct iommu_table {
 	struct iommu_table_ops *it_ops;
 	struct kref it_kref;
 	int it_nid;
+	unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
+	unsigned long it_reserved_end;
 };

 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -149,8 +151,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
  */
-extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
-		int nid);
+extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
+		int nid, unsigned long res_start, unsigned long res_end);

 #define IOMMU_TABLE_GROUP_MAX_TABLES	2

 struct iommu_table_group;
...
@@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl)
 #endif
 }

+static void iommu_table_reserve_pages(struct iommu_table *tbl,
+		unsigned long res_start, unsigned long res_end)
+{
+	int i;
+
+	WARN_ON_ONCE(res_end < res_start);
+	/*
+	 * Reserve page 0 so it will not be used for any mappings.
+	 * This avoids buggy drivers that consider page 0 to be invalid
+	 * to crash the machine or even lose data.
+	 */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	tbl->it_reserved_start = res_start;
+	tbl->it_reserved_end = res_end;
+
+	/* Check if res_start..res_end isn't empty and overlaps the table */
+	if (res_start && res_end &&
+			(tbl->it_offset + tbl->it_size < res_start ||
+			 res_end < tbl->it_offset))
+		return;
+
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		set_bit(i - tbl->it_offset, tbl->it_map);
+}
+
+static void iommu_table_release_pages(struct iommu_table *tbl)
+{
+	int i;
+
+	/*
+	 * In case we have reserved the first bit, we should not emit
+	 * the warning below.
+	 */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+		clear_bit(i - tbl->it_offset, tbl->it_map);
+}
+
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
  */
-struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
+struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
+		unsigned long res_start, unsigned long res_end)
 {
 	unsigned long sz;
 	static int welcomed = 0;
@@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	tbl->it_map = page_address(page);
 	memset(tbl->it_map, 0, sz);

-	/*
-	 * Reserve page 0 so it will not be used for any mappings.
-	 * This avoids buggy drivers that consider page 0 to be invalid
-	 * to crash the machine or even lose data.
-	 */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl, res_start, res_end);

 	/* We only split the IOMMU table if we have 1GB or more of space */
 	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
@@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref)
 		return;
 	}

-	/*
-	 * In case we have reserved the first bit, we should not emit
-	 * the warning below.
-	 */
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);

 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
@@ -1024,15 +1056,14 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);

-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+	iommu_table_release_pages(tbl);

 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
 		ret = -EBUSY;
-		/* Restore bit#0 set by iommu_init_table() */
-		if (tbl->it_offset == 0)
-			set_bit(0, tbl->it_map);
+		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
+		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
+				tbl->it_reserved_end);
 	} else {
 		memset(tbl->it_map, 0xff, sz);
 	}
@@ -1055,9 +1086,8 @@ void iommu_release_ownership(struct iommu_table *tbl)

 	memset(tbl->it_map, 0, sz);

-	/* Restore bit#0 set by iommu_init_table() */
-	if (tbl->it_offset == 0)
-		set_bit(0, tbl->it_map);
+	iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
+			tbl->it_reserved_end);

 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_unlock(&tbl->pools[i].lock);
...
@@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_size = size >> window->table.it_page_shift;
 	window->table.it_ops = &cell_iommu_ops;

-	iommu_init_table(&window->table, iommu->nid);
+	iommu_init_table(&window->table, iommu->nid, 0, 0);

 	pr_debug("\tioid      %d\n", window->ioid);
 	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
...
@@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
 	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
-	iommu_init_table(&iommu_table_iobmap, 0);
+	iommu_init_table(&iommu_table_iobmap, 0, 0, 0);

 	pr_debug(" <- %s\n", __func__);
 }
...
@@ -2303,7 +2303,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
-	iommu_init_table(tbl, phb->hose->node);
+	iommu_init_table(tbl, phb->hose->node, 0, 0);

 	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2420,6 +2420,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = NULL;
 	long rc;
+	unsigned long res_start, res_end;

 	/*
 	 * crashkernel= specifies the kdump kernel's maximum memory at
@@ -2433,19 +2434,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+
+	/*
+	 * We create the default window as big as we can. The constraint is
+	 * the max order of allocation possible. The TCE table is likely to
+	 * end up being multilevel and with on-demand allocation in place,
+	 * the initial use is not going to be huge as the default window aims
+	 * to support crippled devices (i.e. not fully 64bit DMAble) only.
+	 */
+	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
+	unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+	unsigned long tcelevel_order = ilog2(maxblock >> 3);
+	unsigned int levels = tces_order / tcelevel_order;
+
+	if (tces_order % tcelevel_order)
+		levels += 1;
+	/*
+	 * We try to stick to default levels (which is >1 at the moment) in
+	 * order to save memory by relying on on-demand TCE level allocation.
+	 */
+	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);

-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			window_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+			window_size, levels, false, &tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
 				rc);
 		return rc;
 	}

-	iommu_init_table(tbl, pe->phb->hose->node);
+	/* We use top part of 32bit space for MMIO so exclude it from DMA */
+	res_start = 0;
+	res_end = 0;
+	if (window_size > pe->phb->ioda.m32_pci_base) {
+		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+	}
+	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);

 	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
 	if (rc) {
...
@@ -609,7 +609,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 		iommu_table_setparms(pci->phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
-		iommu_init_table(tbl, pci->phb->node);
+		iommu_init_table(tbl, pci->phb->node, 0, 0);

 		/* Divide the rest (1.75GB) among the children */
 		pci->phb->dma_window_size = 0x80000000ul;
@@ -690,7 +690,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
 				ppci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		iommu_init_table(tbl, ppci->phb->node);
+		iommu_init_table(tbl, ppci->phb->node, 0, 0);
 		iommu_register_group(ppci->table_group,
 				pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->table_group);
@@ -719,7 +719,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
-		iommu_init_table(tbl, phb->node);
+		iommu_init_table(tbl, phb->node, 0, 0);
 		set_iommu_table_base(&dev->dev, tbl);
 		return;
 	}
@@ -1169,7 +1169,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
 				pci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		iommu_init_table(tbl, pci->phb->node);
+		iommu_init_table(tbl, pci->phb->node, 0, 0);
 		iommu_register_group(pci->table_group,
 				pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->table_group);
...
@@ -1191,7 +1191,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 	else
 		tbl->it_ops = &iommu_table_pseries_ops;

-	return iommu_init_table(tbl, -1);
+	return iommu_init_table(tbl, -1, 0, 0);
 }

 /**
...
@@ -344,7 +344,7 @@ static void iommu_table_dart_setup(void)
 	iommu_table_dart.it_index = 0;
 	iommu_table_dart.it_blocksize = 1;
 	iommu_table_dart.it_ops = &iommu_dart_ops;
-	iommu_init_table(&iommu_table_dart, -1);
+	iommu_init_table(&iommu_table_dart, -1, 0, 0);

 	/* Reserve the last page of the DART to avoid possible prefetch
 	 * past the DART mapped area
...