Commit 6fd166aa authored by Peter Zijlstra, committed by Ingo Molnar

x86/mm: Use/Fix PCID to optimize user/kernel switches

We can use PCID to retain TLB entries across CR3 switches, including those
that are now part of the user/kernel switch. This improves kernel entry/exit
performance at the cost of more expensive and more complicated TLB flushing.
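
In CR3 terms this works roughly as follows (a minimal sketch, not the
kernel's actual helpers; the bit positions come from the constants added in
the diff below):

	/*
	 * Illustrative only: with PCID enabled, CR3 carries the page-table
	 * root plus an ASID in its low 12 bits.  Setting bit 63
	 * (X86_CR3_PCID_NOFLUSH) makes the CR3 write keep the TLB entries
	 * tagged with that ASID instead of flushing them.
	 */
	#define EXAMPLE_CR3_PCID_NOFLUSH	(1ULL << 63)

	static inline unsigned long long example_build_cr3(unsigned long long pgd_pa,
							   unsigned short pcid,
							   int need_flush)
	{
		unsigned long long cr3 = pgd_pa | (pcid & 0xFFF);

		if (!need_flush)
			cr3 |= EXAMPLE_CR3_PCID_NOFLUSH; /* retain this PCID's TLB entries */
		return cr3;
	}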

Now that we have two address spaces, one for the kernel and one for user
space, we need two PCIDs per mm. We use the top PCID bit to indicate a user
PCID (just like we use the PFN LSB for the PGD). Since TLB invalidation is
done from kernel space, the existing code only invalidates the kernel PCID;
we augment it by marking the corresponding user PCID invalid and, upon the
next switch back to userspace, using a flushing CR3 write for the switch.
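
Schematically (a sketch with illustrative helper names; only the bit numbers
X86_CR3_PTI_SWITCH_BIT == 11 and PAGE_SHIFT == 12 are taken from the patch):

	/* Bit 12 selects the kernel/user half of the 8k PGD ... */
	#define EXAMPLE_PTI_PGTABLE_BIT		12	/* PAGE_SHIFT */
	/* ... and bit 11, the top PCID bit, selects the kernel/user PCID. */
	#define EXAMPLE_PTI_PCID_BIT		11	/* X86_CR3_PTI_SWITCH_BIT */

	static inline unsigned short example_user_pcid(unsigned short kernel_pcid)
	{
		/* The user PCID is the kernel PCID with the top PCID bit set. */
		return kernel_pcid | (1u << EXAMPLE_PTI_PCID_BIT);
	}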

In order to access the user_pcid_flush_mask we use per-CPU storage, which
means the previously established SWAPGS vs. CR3 ordering is now mandatory.

This memory access requires an additional scratch register. Most sites have
a functioning stack and can spill one (RAX); sites without a functional
stack must provide the second scratch register themselves (hence the
SWITCH_TO_USER_CR3_STACK vs. SWITCH_TO_USER_CR3_NOSTACK variants below).

Note: PCID is generally available on Intel Sandybridge and later CPUs.
Note: Up until this point TLB flushing was broken in this series.

Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 48e11198
@@ -3,6 +3,9 @@
 #include <asm/unwind_hints.h>
 #include <asm/cpufeatures.h>
 #include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>

 /*
@@ -191,17 +194,21 @@ For 32-bit we have the following conventions - kernel is built with
 #ifdef CONFIG_PAGE_TABLE_ISOLATION

-/* PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two halves: */
-#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))

-.macro ADJUST_KERNEL_CR3 reg:req
-	/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
-	andq	$(~PTI_SWITCH_MASK), \reg
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
 .endm

-.macro ADJUST_USER_CR3 reg:req
-	/* Move CR3 up a page to the user page tables: */
-	orq	$(PTI_SWITCH_MASK), \reg
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq	$(~PTI_SWITCH_MASK), \reg
 .endm

 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -212,21 +219,58 @@ For 32-bit we have the following conventions - kernel is built with
 .Lend_\@:
 .endm

-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+#define THIS_CPU_user_pcid_flush_mask	\
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
-	ADJUST_USER_CR3 \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT	\scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq	$(PTI_SWITCH_MASK), \scratch_reg
 	mov	\scratch_reg, %cr3
 .Lend_\@:
 .endm

+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
-	 * Is the switch bit zero?  This means the address is
-	 * up in real PAGE_TABLE_ISOLATION patches in a moment.
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
 	 */
 	testq	$(PTI_SWITCH_MASK), \scratch_reg
 	jz	.Ldone_\@
@@ -251,7 +295,9 @@ For 32-bit we have the following conventions - kernel is built with
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 .endm
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 .endm
......
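In C-like terms, the flush test in SWITCH_TO_USER_CR3_NOSTACK above does
roughly the following (an illustrative sketch only; the entry path must do
this in assembly with the scratch registers it is given, before any C code
can run):

	static inline unsigned long example_user_cr3(unsigned long kernel_cr3,
						     unsigned short *user_pcid_flush_mask)
	{
		/*
		 * Low PCID bits of the kernel CR3; dynamic ASIDs are small
		 * (< TLB_NR_DYN_ASIDS + 1), so the shifts below stay in range.
		 */
		unsigned long asid = kernel_cr3 & 0x7FF;
		unsigned long cr3  = kernel_cr3;

		if (*user_pcid_flush_mask & (1u << asid)) {
			/* A flush is pending: consume it and do a flushing write. */
			*user_pcid_flush_mask &= ~(1u << asid);
		} else {
			/* No flush needed: keep the user PCID's TLB entries. */
			cr3 |= 1UL << 63;		/* X86_CR3_PCID_NOFLUSH */
		}

		/* Flip to the user half of the PGD and the user PCID. */
		return cr3 | (1UL << 12) | (1UL << 11);	/* PTI_SWITCH_MASK */
	}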
@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>

+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"
@@ -406,7 +407,7 @@ syscall_return_via_sysret:
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
@@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
	 * We can do future final exit work right here.
	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
@@ -857,7 +858,7 @@ native_irq_return_ldt:
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax

-	SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	SWAPGS					/* to user GS */
	popq	%rdi				/* Restore user RDI */
......
@@ -275,9 +275,9 @@ sysret32_from_system_call:
	 * switch until after after the last reference to the process
	 * stack.
	 *
-	 * %r8 is zeroed before the sysret, thus safe to clobber.
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%r8
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
	xorq	%r8, %r8
	xorq	%r9, %r9
......
@@ -38,6 +38,11 @@
 #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK	0xFFFull
 #define CR3_NOFLUSH	BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT	11
+#endif
+
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
......
@@ -10,6 +10,8 @@
 #include <asm/special_insns.h>
 #include <asm/smp.h>
 #include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>

 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
@@ -24,24 +26,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)

 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
+
 /*
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * user/kernel switches
  */
-#define PTI_CONSUMED_ASID_BITS		0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)

-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
  * for them being zero-based.  Another -1 is because ASID 0 is reserved for
  * use by non-PCID-aware users.
  */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS	6

 static inline u16 kern_pcid(u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
 	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * situation in which PCID-unaware code saves CR3, loads some other
@@ -95,12 +127,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
 	return !static_cpu_has(X86_FEATURE_PCID);
 }

-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -145,6 +171,13 @@ struct tlb_state {
	 */
	bool invalidate_other;

+	/*
+	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+	 * the corresponding user PCID needs a flush next time we
+	 * switch to it; see SWITCH_TO_USER_CR3.
+	 */
+	unsigned short user_pcid_flush_mask;
+
	/*
	 * Access to this CR4 shadow and to H/W CR4 is protected by
	 * disabling interrupts when modifying either one.
@@ -249,15 +282,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)

 extern void initialize_tlbstate_and_flush(void);

+/*
+ * Given an ASID, flush the corresponding user ASID.  We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 /*
  * flush the entire current user mapping
  */
 static inline void __native_flush_tlb(void)
 {
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * If current->mm == NULL then we borrow a mm which may change
+	 * during a task switch and therefore we must not be preempted
+	 * while we write CR3 back:
	 */
	preempt_disable();
	native_write_cr3(__native_read_cr3());
@@ -301,7 +361,14 @@ static inline void __native_flush_tlb_global(void)
  */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	invalidate_user_asid(loaded_mm_asid);
 }

 /*
......
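The deferred flush added in this header is a simple hand-off between the C
flush paths and the entry assembly. A self-contained sketch of that life
cycle (the helper names are illustrative; in the kernel the mask lives in
per-CPU tlb_state and kern_pcid() maps ASID to ASID + 1):

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned short user_pcid_flush_mask;	/* per-CPU in the real kernel */

	static void sketch_invalidate_user_asid(unsigned int asid)
	{
		/* Kernel-side flush: just mark the user PCID stale. */
		user_pcid_flush_mask |= 1u << (asid + 1);
	}

	static bool sketch_switch_to_user_cr3(unsigned int asid)
	{
		unsigned int bit = 1u << (asid + 1);
		bool need_flush = user_pcid_flush_mask & bit;

		user_pcid_flush_mask &= ~bit;	/* consume the pending flush */
		return need_flush;		/* true: flushing CR3 write, false: NOFLUSH */
	}

	int main(void)
	{
		sketch_invalidate_user_asid(0);			/* e.g. __native_flush_tlb() */
		printf("%d\n", sketch_switch_to_user_cr3(0));	/* 1: flush on next user entry */
		printf("%d\n", sketch_switch_to_user_cr3(0));	/* 0: NOFLUSH after that */
		return 0;
	}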
@@ -78,7 +78,12 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS	12
+#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH	_BITULL(X86_CR3_PCID_NOFLUSH_BIT)

 /*
  * Intel CPU features in CR4
......
@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>

 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void)
 	BLANK();
	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));

+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
	/* Layout info for cpu_entry_area */
	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
......
@@ -855,7 +855,7 @@ void __init zone_sizes_init(void)
	free_area_init_nodes(max_zone_pfns);
 }

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
	.loaded_mm = &init_mm,
	.next_asid = 1,
	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
......
@@ -105,6 +105,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
	unsigned long new_mm_cr3;

	if (need_flush) {
+		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
......