Commit 3689c37d authored by Mark Hairgrove, committed by Michael Ellerman

powerpc/powernv/npu: Use size-based ATSD invalidates

Prior to this change only two types of ATSDs were issued to the NPU:
invalidates targeting a single page and invalidates targeting the whole
address space. The crossover point happened at the configurable
atsd_threshold which defaulted to 2M. For invalidates of that size or
smaller, per-page invalidates were issued across the whole range.

The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all.
These invalidates target addresses aligned to their size. 2M is a common
invalidation size for GPU-enabled applications because that is a GPU
page size, so reducing the number of invalidates by 32x in that case is a
clear improvement.

ATSD latency is high in general so now we always issue a single invalidate
rather than multiple. This will over-invalidate in some cases, but for any
invalidation size over 2M it matches or improves the prior behavior.
There's also an improvement for single-page invalidates since the prior
version issued two invalidates for that case instead of one.

With this change all issued ATSDs now perform a flush, so the flush
parameter has been removed from all the helpers.

To show the benefit here are some performance numbers from a
microbenchmark which creates a 1G allocation then uses mprotect with
PROT_NONE to trigger invalidates in strides across the allocation.

One NPU (1 GPU):

         mprotect rate (GB/s)
Stride   Before      After      Speedup
64K         5.3        5.6           5%
1M         39.3       57.4          46%
2M         49.7       82.6          66%
4M        286.6      285.7           0%

Two NPUs (6 GPUs):

         mprotect rate (GB/s)
Stride   Before      After      Speedup
64K         6.5        7.4          13%
1M         33.4       67.9         103%
2M         38.7       93.1         141%
4M        356.7      354.6          -1%

Anything over 2M is roughly the same as before since both cases issue a
single ATSD.
Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Reviewed-By: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent 7ead15a1
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <linux/memblock.h> #include <linux/memblock.h>
#include <linux/iommu.h> #include <linux/iommu.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/sizes.h>
#include <asm/debugfs.h> #include <asm/debugfs.h>
#include <asm/tlb.h> #include <asm/tlb.h>
...@@ -458,8 +459,7 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg) ...@@ -458,8 +459,7 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
#define XTS_ATSD_AVA 1 #define XTS_ATSD_AVA 1
#define XTS_ATSD_STAT 2 #define XTS_ATSD_STAT 2
static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize, static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
bool flush)
{ {
unsigned long launch = 0; unsigned long launch = 0;
...@@ -477,8 +477,7 @@ static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize, ...@@ -477,8 +477,7 @@ static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
/* PID */ /* PID */
launch |= pid << PPC_BITLSHIFT(38); launch |= pid << PPC_BITLSHIFT(38);
/* No flush */ /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
launch |= !flush << PPC_BITLSHIFT(39);
return launch; return launch;
} }
...@@ -501,23 +500,22 @@ static void mmio_atsd_regs_write(struct mmio_atsd_reg ...@@ -501,23 +500,22 @@ static void mmio_atsd_regs_write(struct mmio_atsd_reg
} }
static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
unsigned long pid, bool flush) unsigned long pid)
{ {
unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush); unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
/* Invalidating the entire process doesn't use a va */ /* Invalidating the entire process doesn't use a va */
mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
} }
static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], static void mmio_invalidate_range(struct mmio_atsd_reg
unsigned long va, unsigned long pid, bool flush) mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
unsigned long start, unsigned long psize)
{ {
unsigned long launch; unsigned long launch = get_atsd_launch_val(pid, psize);
launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
/* Write all VAs first */ /* Write all VAs first */
mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va); mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
/* Issue one barrier for all address writes */ /* Issue one barrier for all address writes */
eieio(); eieio();
...@@ -609,14 +607,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) ...@@ -609,14 +607,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
} }
/* /*
* Invalidate either a single address or an entire PID depending on * Invalidate a virtual address range
* the value of va.
*/ */
static void mmio_invalidate(struct npu_context *npu_context, int va, static void mmio_invalidate(struct npu_context *npu_context,
unsigned long address, bool flush) unsigned long start, unsigned long size)
{ {
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
unsigned long pid = npu_context->mm->context.id; unsigned long pid = npu_context->mm->context.id;
unsigned long atsd_start = 0;
unsigned long end = start + size - 1;
int atsd_psize = MMU_PAGE_COUNT;
/*
* Convert the input range into one of the supported sizes. If the range
* doesn't fit, use the next larger supported size. Invalidation latency
* is high, so over-invalidation is preferred to issuing multiple
* invalidates.
*
* A 4K page size isn't supported by NPU/GPU ATS, so that case is
* ignored.
*/
if (size == SZ_64K) {
atsd_start = start;
atsd_psize = MMU_PAGE_64K;
} else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
atsd_start = ALIGN_DOWN(start, SZ_2M);
atsd_psize = MMU_PAGE_2M;
} else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
atsd_start = ALIGN_DOWN(start, SZ_1G);
atsd_psize = MMU_PAGE_1G;
}
if (npu_context->nmmu_flush) if (npu_context->nmmu_flush)
/* /*
...@@ -631,23 +651,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, ...@@ -631,23 +651,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
* an invalidate. * an invalidate.
*/ */
acquire_atsd_reg(npu_context, mmio_atsd_reg); acquire_atsd_reg(npu_context, mmio_atsd_reg);
if (va)
mmio_invalidate_va(mmio_atsd_reg, address, pid, flush); if (atsd_psize == MMU_PAGE_COUNT)
mmio_invalidate_pid(mmio_atsd_reg, pid);
else else
mmio_invalidate_pid(mmio_atsd_reg, pid, flush); mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
atsd_psize);
mmio_invalidate_wait(mmio_atsd_reg); mmio_invalidate_wait(mmio_atsd_reg);
if (flush) {
/* /*
* The GPU requires two flush ATSDs to ensure all entries have * The GPU requires two flush ATSDs to ensure all entries have been
* been flushed. We use PID 0 as it will never be used for a * flushed. We use PID 0 as it will never be used for a process on the
* process on the GPU. * GPU.
*/ */
mmio_invalidate_pid(mmio_atsd_reg, 0, true); mmio_invalidate_pid(mmio_atsd_reg, 0);
mmio_invalidate_wait(mmio_atsd_reg); mmio_invalidate_wait(mmio_atsd_reg);
mmio_invalidate_pid(mmio_atsd_reg, 0, true); mmio_invalidate_pid(mmio_atsd_reg, 0);
mmio_invalidate_wait(mmio_atsd_reg); mmio_invalidate_wait(mmio_atsd_reg);
}
release_atsd_reg(mmio_atsd_reg); release_atsd_reg(mmio_atsd_reg);
} }
...@@ -664,7 +686,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, ...@@ -664,7 +686,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
* There should be no more translation requests for this PID, but we * There should be no more translation requests for this PID, but we
* need to ensure any entries for it are removed from the TLB. * need to ensure any entries for it are removed from the TLB.
*/ */
mmio_invalidate(npu_context, 0, 0, true); mmio_invalidate(npu_context, 0, ~0UL);
} }
static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
...@@ -673,8 +695,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, ...@@ -673,8 +695,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
pte_t pte) pte_t pte)
{ {
struct npu_context *npu_context = mn_to_npu_context(mn); struct npu_context *npu_context = mn_to_npu_context(mn);
mmio_invalidate(npu_context, address, PAGE_SIZE);
mmio_invalidate(npu_context, 1, address, true);
} }
static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
...@@ -682,21 +703,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, ...@@ -682,21 +703,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
unsigned long start, unsigned long end) unsigned long start, unsigned long end)
{ {
struct npu_context *npu_context = mn_to_npu_context(mn); struct npu_context *npu_context = mn_to_npu_context(mn);
unsigned long address; mmio_invalidate(npu_context, start, end - start);
if (end - start > atsd_threshold) {
/*
* Just invalidate the entire PID if the address range is too
* large.
*/
mmio_invalidate(npu_context, 0, 0, true);
} else {
for (address = start; address < end; address += PAGE_SIZE)
mmio_invalidate(npu_context, 1, address, false);
/* Do the flush only on the final addess == end */
mmio_invalidate(npu_context, 1, address, true);
}
} }
static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment