Commit 92598ae2 authored by Linus Torvalds

Merge tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Borislav Petkov:

 - Rename a PKRU macro to make more sense when reading the code

 - Update pkeys documentation

 - Avoid reading a contended mm's TLB generation variable when not
   absolutely necessary, and fix a case where arch_tlbbatch_flush()
   doesn't adhere to the generation scheme, thus violating the
   conditions for the above avoidance.

* tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/tlb: Ignore f->new_tlb_gen when zero
  x86/pkeys: Clarify PKRU_AD_KEY macro
  Documentation/protection-keys: Clean up documentation for User Space pkeys
  x86/mm/tlb: Avoid reading mm_tlb_gen when possible
parents 94e37e84 8f1d56f6
@@ -4,31 +4,29 @@
 Memory Protection Keys
 ======================
 
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
-
-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains. It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
+Memory Protection Keys provide a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables when an
+application changes protection domains.
+
+Pkeys Userspace (PKU) is a feature which can be found on:
+        * Intel server CPUs, Skylake and later
+        * Intel client CPUs, Tiger Lake (11th Gen Core) and later
+        * Future AMD CPUs
 
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key. Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
+Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.
+
+Protections for each key are defined with a per-CPU user-accessible register
+(PKRU). Each of these is a 32-bit register storing two bits (Access Disable
+and Write Disable) for each of 16 keys.
+
+Being a CPU register, PKRU is inherently thread-local, potentially giving each
 thread a different set of protections from every other thread.
 
-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register. The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs. These
-permissions are enforced on data access only and have no effect on
-instruction fetches.
+There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
+register. The feature is only available in 64-bit mode, even though there is
+theoretically space in the PAE PTEs. These permissions are enforced on data
+access only and have no effect on instruction fetches.
 
 Syscalls
 ========
...
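The cleaned-up document above describes the hardware model; as a companion, here is a minimal user-space sketch (not part of this merge) showing the typical allocate/tag/toggle flow with the glibc 2.27+ wrappers pkey_alloc(), pkey_mprotect() and pkey_set(). It assumes an x86_64 CPU with pkeys support and keeps error handling deliberately short.

/* Minimal pkeys usage sketch; illustration only, not kernel or commit code. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = (size_t)sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Allocate a protection key with no restrictions to start with. */
	int pkey = pkey_alloc(0, 0);
	if (pkey < 0) {
		perror("pkey_alloc");
		return 1;
	}

	/* Tag the mapping once; later permission changes touch only PKRU. */
	if (pkey_mprotect(buf, len, PROT_READ | PROT_WRITE, pkey)) {
		perror("pkey_mprotect");
		return 1;
	}

	strcpy(buf, "hello pkeys");	/* allowed: AD/WD bits are clear */

	/* WRPKRU under the hood: deny all access to this key for this thread. */
	pkey_set(pkey, PKEY_DISABLE_ACCESS);
	/* Touching buf here would raise SIGSEGV with si_code SEGV_PKUERR. */

	pkey_set(pkey, 0);		/* allow access again */
	printf("%s\n", buf);

	pkey_free(pkey);
	munmap(buf, len);
	return 0;
}

Note that the pkey_set() calls change only this thread's PKRU register, which is exactly the thread-local property the documentation calls out.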
@@ -16,6 +16,7 @@
 void __flush_tlb_all(void);
 
 #define TLB_FLUSH_ALL			-1UL
+#define TLB_GENERATION_INVALID		0
 
 void cr4_update_irqsoff(unsigned long set, unsigned long clear);
 unsigned long cr4_read_shadow(void);
...
@@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
 	return vma_pkey(vma);
 }
 
-#define PKRU_AD_KEY(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+#define PKRU_AD_MASK(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
 
 /*
  * Make the default PKRU value (at execve() time) as restrictive
@@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
  * in the process's lifetime will not accidentally get access
  * to data which is pkey-protected later on.
  */
-u32	init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
-		      PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
-		      PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
-		      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
-		      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+u32	init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) |
+			  PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) |
+			  PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) |
+			  PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) |
+			  PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) |
+			  PKRU_AD_MASK(11) | PKRU_AD_MASK(12) |
+			  PKRU_AD_MASK(13) | PKRU_AD_MASK(14) |
+			  PKRU_AD_MASK(15);
 
 static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
 				   size_t count, loff_t *ppos)
...
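As a quick check on the renamed macro, the default PKRU value that the pkeys.c hunk above builds from PKRU_AD_MASK(1)..PKRU_AD_MASK(15) can be recomputed in a few lines of ordinary C. The two constants mirror the kernel's PKRU_AD_BIT and PKRU_BITS_PER_PKEY definitions; everything else is illustration only.

/* Recompute init_pkru_value outside the kernel; illustration only. */
#include <stdint.h>
#include <stdio.h>

#define PKRU_AD_BIT		0x1u	/* mirrors the kernel definition */
#define PKRU_BITS_PER_PKEY	2	/* AD + WD bits per key */
#define PKRU_AD_MASK(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))

int main(void)
{
	uint32_t init_pkru = 0;

	/* Access-disable every key except key 0, the default key for mprotect(). */
	for (int pkey = 1; pkey <= 15; pkey++)
		init_pkru |= PKRU_AD_MASK(pkey);

	printf("init_pkru_value = 0x%08x\n", init_pkru);	/* prints 0x55555554 */
	return 0;
}

In other words, the rename is purely cosmetic: each invocation still yields a single Access-Disable bit mask, and OR-ing keys 1 through 15 gives the restrictive execve()-time default of 0x55555554.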
@@ -734,10 +734,10 @@ static void flush_tlb_func(void *info)
 	const struct flush_tlb_info *f = info;
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
 	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 	bool local = smp_processor_id() == f->initiating_cpu;
 	unsigned long nr_invalidate = 0;
+	u64 mm_tlb_gen;
 
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
@@ -771,6 +771,23 @@ static void flush_tlb_func(void *info)
 		return;
 	}
 
+	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
+		     f->new_tlb_gen <= local_tlb_gen)) {
+		/*
+		 * The TLB is already up to date in respect to f->new_tlb_gen.
+		 * While the core might be still behind mm_tlb_gen, checking
+		 * mm_tlb_gen unnecessarily would have negative caching effects
+		 * so avoid it.
+		 */
+		return;
+	}
+
+	/*
+	 * Defer mm_tlb_gen reading as long as possible to avoid cache
+	 * contention.
+	 */
+	mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+
 	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
 		/*
 		 * There's nothing to do: we're already up to date. This can
@@ -827,6 +844,12 @@ static void flush_tlb_func(void *info)
 		/* Partial flush */
 		unsigned long addr = f->start;
 
+		/* Partial flush cannot have invalid generations */
+		VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
+
+		/* Partial flush must have valid mm */
+		VM_WARN_ON(f->mm == NULL);
+
 		nr_invalidate = (f->end - f->start) >> f->stride_shift;
 
 		while (addr < f->end) {
@@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		struct flush_tlb_info *info;
 
 		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+		info = get_flush_tlb_info(NULL, start, end, 0, false,
+					  TLB_GENERATION_INVALID);
 
 		on_each_cpu(do_kernel_range_flush, info, 1);
@@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	int cpu = get_cpu();
 
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+				  TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
...
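Taken together, the tlb.c hunks reorder flush_tlb_func() so that the shared mm->context.tlb_gen is read only after the cheap per-request check against the local generation has failed, and they give callers without a tracked mm (flush_tlb_kernel_range(), arch_tlbbatch_flush()) the TLB_GENERATION_INVALID sentinel so they bypass that early-return path. The fragment below is a self-contained toy model of just that decision order; the names mirror the diff, but the types, the counter read and the "flush" itself are simulated, so it is a sketch rather than the actual kernel function.

/* Toy model of the new flush_tlb_func() decision order; not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TLB_GENERATION_INVALID	0	/* same sentinel as the tlbflush.h hunk */

/* Counts how often the expensive, contended mm_tlb_gen read happens. */
static unsigned long mm_tlb_gen_reads;

static uint64_t read_mm_tlb_gen(uint64_t mm_tlb_gen)
{
	mm_tlb_gen_reads++;		/* stands in for atomic64_read() */
	return mm_tlb_gen;
}

/* Returns true if a flush would actually be performed. */
static bool flush_decision(uint64_t new_tlb_gen, uint64_t local_tlb_gen,
			   uint64_t mm_tlb_gen)
{
	/* Cheap check first: the requested generation is already reached locally. */
	if (new_tlb_gen != TLB_GENERATION_INVALID && new_tlb_gen <= local_tlb_gen)
		return false;

	/* Only now consult the shared generation counter. */
	if (local_tlb_gen == read_mm_tlb_gen(mm_tlb_gen))
		return false;		/* already fully up to date */

	return true;			/* partial or full flush needed */
}

int main(void)
{
	/* CPU at gen 5, request only needs gen 4: no flush, no shared read. */
	printf("flush=%d shared_reads=%lu\n",
	       flush_decision(4, 5, 7), mm_tlb_gen_reads);

	/* Request needs gen 6: the shared counter is read and a flush happens. */
	printf("flush=%d shared_reads=%lu\n",
	       flush_decision(6, 5, 7), mm_tlb_gen_reads);
	return 0;
}

Running it shows that a request whose generation is already covered returns without touching the shared counter at all, which is the cache-contention win the pull request summary describes.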