Commit f4dd60a3 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'x86-mm-2020-06-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Misc changes:

   - Unexport various PAT primitives

   - Unexport per-CPU tlbstate and uninline TLB helpers"

* tag 'x86-mm-2020-06-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/tlb/uv: Add a forward declaration for struct flush_tlb_info
  x86/cpu: Export native_write_cr4() only when CONFIG_LKTDM=m
  x86/tlb: Restrict access to tlbstate
  xen/privcmd: Remove unneeded asm/tlb.h include
  x86/tlb: Move PCID helpers where they are used
  x86/tlb: Uninline nmi_uaccess_okay()
  x86/tlb: Move cr4_set_bits_and_update_boot() to the usage site
  x86/tlb: Move paravirt_tlb_remove_table() to the usage site
  x86/tlb: Move __flush_tlb_all() out of line
  x86/tlb: Move flush_tlb_others() out of line
  x86/tlb: Move __flush_tlb_one_kernel() out of line
  x86/tlb: Move __flush_tlb_one_user() out of line
  x86/tlb: Move __flush_tlb_global() out of line
  x86/tlb: Move __flush_tlb() out of line
  x86/alternatives: Move temporary_mm helpers into C
  x86/cr4: Sanitize CR4.PCE update
  x86/cpu: Uninline CR4 accessors
  x86/tlb: Uninline __get_current_cr3_fast()
  x86/mm: Use pgprotval_t in protval_4k_2_large() and protval_large_2_4k()
  x86/mm: Unexport __cachemode2pte_tbl
  ...
parents 435faf5c bd1de2a7
......@@ -2166,11 +2166,6 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}
static void refresh_pce(void *ignored)
{
load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
}
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
......@@ -2189,7 +2184,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
lockdep_assert_held_write(&mm->mmap_sem);
if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
......@@ -2199,7 +2194,7 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}
static int x86_pmu_event_idx(struct perf_event *event)
......@@ -2257,7 +2252,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
else if (x86_pmu.attr_rdpmc == 2)
static_branch_dec(&rdpmc_always_available_key);
on_each_cpu(refresh_pce, NULL, 1);
on_each_cpu(cr4_update_pce, NULL, 1);
x86_pmu.attr_rdpmc = val;
}
......
......@@ -24,4 +24,7 @@ extern void memtype_free_io(resource_size_t start, resource_size_t end);
extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
bool x86_has_pat_wp(void);
enum page_cache_mode pgprot2cachemode(pgprot_t pgprot);
#endif /* _ASM_X86_MEMTYPE_H */
......@@ -24,21 +24,9 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
#endif /* !CONFIG_PARAVIRT_XXL */
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
static inline void load_mm_cr4_irqsoff(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
(!static_branch_unlikely(&rdpmc_never_available_key) &&
atomic_read(&mm->context.perf_rdpmc_allowed)))
cr4_set_bits_irqsoff(X86_CR4_PCE);
else
cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
#else
static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {}
void cr4_update_pce(void *ignored);
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
......@@ -225,78 +213,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
* It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully.
*/
static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes
* that override the kernel memory protections (e.g., W^X), without exposing the
* temporary page-table mappings that are required for these write operations to
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
* mapping is torn down.
*
* Context: The temporary mm needs to be used exclusively by a single core. To
* harden security IRQs must be disabled while the temporary mm is
* loaded, thereby preventing interrupt handler bugs from overriding
* the kernel memory protection.
*/
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return temp_state;
}
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
unsigned long __get_current_cr3_fast(void);
#endif /* _ASM_X86_MMU_CONTEXT_H */
......@@ -47,7 +47,13 @@ static inline void slow_down_io(void)
#endif
}
static inline void __flush_tlb(void)
void native_flush_tlb_local(void);
void native_flush_tlb_global(void);
void native_flush_tlb_one_user(unsigned long addr);
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
static inline void __flush_tlb_local(void)
{
PVOP_VCALL0(mmu.flush_tlb_user);
}
......@@ -62,7 +68,7 @@ static inline void __flush_tlb_one_user(unsigned long addr)
PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
static inline void __flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
......
......@@ -60,7 +60,7 @@ void sync_initial_page_table(void);
#define kpte_clear_flush(ptep, vaddr) \
do { \
pte_clear(&init_mm, (vaddr), (ptep)); \
__flush_tlb_one_kernel((vaddr)); \
flush_tlb_one_kernel((vaddr)); \
} while (0)
#endif /* !__ASSEMBLY__ */
......
......@@ -471,9 +471,6 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
extern uint8_t __pte2cachemode_tbl[8];
#define __pte2cm_idx(cb) \
((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \
(((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \
......@@ -483,43 +480,26 @@ extern uint8_t __pte2cachemode_tbl[8];
(((i) & 2) << (_PAGE_BIT_PCD - 1)) | \
(((i) & 1) << _PAGE_BIT_PWT))
static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
{
if (likely(pcm == 0))
return 0;
return __cachemode2pte_tbl[pcm];
}
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
return __pgprot(cachemode2protval(pcm));
}
static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
{
unsigned long masked;
unsigned long cachemode2protval(enum page_cache_mode pcm);
masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
if (likely(masked == 0))
return 0;
return __pte2cachemode_tbl[__pte2cm_idx(masked)];
static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
{
return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
pgprotval_t val = pgprot_val(pgprot);
pgprot_t new;
pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
return new;
return __pgprot(protval_4k_2_large(pgprot_val(pgprot)));
}
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
{
pgprotval_t val = pgprot_val(pgprot);
pgprot_t new;
pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
((val & _PAGE_PAT_LARGE) >>
(_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
return new;
}
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
return __pgprot(protval_large_2_4k(pgprot_val(pgprot)));
}
......
......@@ -13,140 +13,51 @@
#include <asm/pti.h>
#include <asm/processor-flags.h>
/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
* to what is traditionally called ASID on the RISC processors.
*
* We don't use the traditional ASID implementation, where each process/mm gets
* its own ASID and flush/restart when we run out of ASID space.
*
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
* that came by on this CPU, allowing cheaper switch_mm between processes on
* this CPU.
*
* We end up with different spaces for different things. To avoid confusion we
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
* the canonical identifier for an mm
*
* kPCID - [1, TLB_NR_DYN_ASIDS]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
/*
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
#endif
void __flush_tlb_all(void);
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
#define TLB_FLUSH_ALL -1UL
/*
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
* for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
* lines.
*/
#define TLB_NR_DYN_ASIDS 6
/*
* Given @asid, compute kPCID
*/
static inline u16 kern_pcid(u16 asid)
/* Set in this cpu's CR4. */
static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not confict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
/*
* The ASID being passed in here should have respected the
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
*/
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
#endif
/*
* The dynamically-assigned ASIDs that get passed in are small
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
* so do not bother to clear it.
*
* If PCID is on, ASID-aware code paths put the ASID+1 into the
* PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
* value (with PCID == 0), and then restores CR3, thus corrupting
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
* that any bugs involving loading a PCID-enabled CR3 with
* CR4.PCIDE off will trigger deterministically.
*/
return asid + 1;
cr4_update_irqsoff(mask, 0);
}
/*
* Given @asid, compute uPCID
*/
static inline u16 user_pcid(u16 asid)
/* Clear in this cpu's CR4. */
static inline void cr4_clear_bits_irqsoff(unsigned long mask)
{
u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
cr4_update_irqsoff(0, mask);
}
struct pgd_t;
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
/* Set in this cpu's CR4. */
static inline void cr4_set_bits(unsigned long mask)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
return __sme_pa(pgd) | kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(pgd);
}
unsigned long flags;
local_irq_save(flags);
cr4_set_bits_irqsoff(mask);
local_irq_restore(flags);
}
static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
/* Clear in this cpu's CR4. */
static inline void cr4_clear_bits(unsigned long mask)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
unsigned long flags;
local_irq_save(flags);
cr4_clear_bits_irqsoff(mask);
local_irq_restore(flags);
}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define __flush_tlb() __native_flush_tlb()
#define __flush_tlb_global() __native_flush_tlb_global()
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
#ifndef MODULE
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
* lines.
*/
#define TLB_NR_DYN_ASIDS 6
struct tlb_context {
u64 ctx_id;
......@@ -242,38 +153,7 @@ struct tlb_state {
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
* switching the loaded mm. It can also be dangerous if we
* interrupted some kernel code that was temporarily using a
* different mm.
*/
static inline bool nmi_uaccess_okay(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
struct mm_struct *current_mm = current->mm;
VM_WARN_ON_ONCE(!loaded_mm);
/*
* The condition we want to check is
* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
* is supposed to be reasonably fast.
*
* Instead, we check the almost equivalent but somewhat conservative
* condition below, and we rely on the fact that switch_mm_irqs_off()
* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
*/
if (loaded_mm != current_mm)
return false;
VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
return true;
}
bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay
/* Initialize cr4 shadow for this CPU. */
......@@ -282,249 +162,11 @@ static inline void cr4_init_shadow(void)
this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
}
static inline void __cr4_set(unsigned long cr4)
{
lockdep_assert_irqs_disabled();
this_cpu_write(cpu_tlbstate.cr4, cr4);
__write_cr4(cr4);
}
/* Set in this cpu's CR4. */
static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 | mask) != cr4)
__cr4_set(cr4 | mask);
}
/* Clear in this cpu's CR4. */
static inline void cr4_clear_bits_irqsoff(unsigned long mask)
{
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 & ~mask) != cr4)
__cr4_set(cr4 & ~mask);
}
/* Set in this cpu's CR4. */
static inline void cr4_set_bits(unsigned long mask)
{
unsigned long flags;
local_irq_save(flags);
cr4_set_bits_irqsoff(mask);
local_irq_restore(flags);
}
/* Clear in this cpu's CR4. */
static inline void cr4_clear_bits(unsigned long mask)
{
unsigned long flags;
local_irq_save(flags);
cr4_clear_bits_irqsoff(mask);
local_irq_restore(flags);
}
static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
__cr4_set(cr4 ^ mask);
}
/* Read the CR4 shadow. */
static inline unsigned long cr4_read_shadow(void)
{
return this_cpu_read(cpu_tlbstate.cr4);
}
/*
* Mark all other ASIDs as invalid, preserves the current.
*/
static inline void invalidate_other_asid(void)
{
this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
* up after us can get the correct flags. This should only be used
* during boot on the boot cpu.
*/
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
mmu_cr4_features |= mask;
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
cr4_set_bits(mask);
}
extern void initialize_tlbstate_and_flush(void);
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
* See SWITCH_TO_USER_CR3.
*/
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
return;
/*
* We only have a single ASID if PCID is off and the CR3
* write will have flushed it.
*/
if (!cpu_feature_enabled(X86_FEATURE_PCID))
return;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
__set_bit(kern_pcid(asid),
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
/*
* flush the entire current user mapping
*/
static inline void __native_flush_tlb(void)
{
/*
* Preemption or interrupts must be disabled to protect the access
* to the per CPU variable and to prevent being preempted between
* read_cr3() and write_cr3().
*/
WARN_ON_ONCE(preemptible());
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3());
}
/*
* flush everything
*/
static inline void __native_flush_tlb_global(void)
{
unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
}
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* toggle PGE */
native_write_cr4(cr4 ^ X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
/*
* flush one page in the user mapping
*/
static inline void __native_flush_tlb_one_user(unsigned long addr)
{
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* Just use invalidate_user_asid() in case we are called early.
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
invalidate_user_asid(loaded_mm_asid);
else
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
}
/*
* flush everything
*/
static inline void __flush_tlb_all(void)
{
/*
* This is to catch users with enabled preemption and the PGE feature
* and don't trigger the warning in __native_flush_tlb().
*/
VM_WARN_ON_ONCE(preemptible());
if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
*/
__flush_tlb();
}
}
/*
* flush one page in the kernel mapping
*/
static inline void __flush_tlb_one_kernel(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
/*
* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
* paravirt equivalent. Even with PCID, this is sufficient: we only
* use PCID if we also use global PTEs for the kernel mapping, and
* INVLPG flushes global translations across all address spaces.
*
* If PTI is on, then the kernel is mapped with non-global PTEs, and
* __flush_tlb_one_user() will flush the given address for the current
* kernel address space and for its usermode counterpart, but it does
* not flush it for other address spaces.
*/
__flush_tlb_one_user(addr);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* See above. We need to propagate the flush to all other address
* spaces. In principle, we only need to propagate it to kernelmode
* address spaces, but the extra bookkeeping we would need is not
* worth it.
*/
invalidate_other_asid();
}
#define TLB_FLUSH_ALL -1UL
/*
* TLB flushing:
*
......@@ -563,7 +205,15 @@ struct flush_tlb_info {
bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
void flush_tlb_one_kernel(unsigned long addr);
void flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
#define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
......@@ -585,9 +235,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
......@@ -608,12 +255,6 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, info) \
native_flush_tlb_others(mask, info)
#define paravirt_tlb_remove_table(tlb, page) \
tlb_remove_page(tlb, (void *)(page))
#endif
#endif /* !MODULE */
#endif /* _ASM_X86_TLBFLUSH_H */
......@@ -8,6 +8,7 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
struct cpumask;
struct mm_struct;
struct flush_tlb_info;
#ifdef CONFIG_X86_UV
#include <linux/efi.h>
......
......@@ -783,6 +783,61 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
}
}
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes
* that override the kernel memory protections (e.g., W^X), without exposing the
* temporary page-table mappings that are required for these write operations to
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
* mapping is torn down.
*
* Context: The temporary mm needs to be used exclusively by a single core. To
* harden security IRQs must be disabled while the temporary mm is
* loaded, thereby preventing interrupt handler bugs from overriding
* the kernel memory protection.
*/
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return temp_state;
}
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
......
......@@ -387,7 +387,30 @@ void native_write_cr4(unsigned long val)
bits_missing);
}
}
EXPORT_SYMBOL(native_write_cr4);
#if IS_MODULE(CONFIG_LKDTM)
EXPORT_SYMBOL_GPL(native_write_cr4);
#endif
void cr4_update_irqsoff(unsigned long set, unsigned long clear)
{
unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
lockdep_assert_irqs_disabled();
newval = (cr4 & ~clear) | set;
if (newval != cr4) {
this_cpu_write(cpu_tlbstate.cr4, newval);
__write_cr4(newval);
}
}
EXPORT_SYMBOL(cr4_update_irqsoff);
/* Read the CR4 shadow. */
unsigned long cr4_read_shadow(void)
{
return this_cpu_read(cpu_tlbstate.cr4);
}
EXPORT_SYMBOL_GPL(cr4_read_shadow);
void cr4_init(void)
{
......
......@@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
flush_tlb_local();
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
......@@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
flush_tlb_local();
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
......
......@@ -160,25 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
return insn_len;
}
static void native_flush_tlb(void)
{
__native_flush_tlb();
}
/*
* Global pages have to be flushed a bit differently. Not a real
* performance problem because this does not happen often.
*/
static void native_flush_tlb_global(void)
{
__native_flush_tlb_global();
}
static void native_flush_tlb_one_user(unsigned long addr)
{
__native_flush_tlb_one_user(addr);
}
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
......@@ -359,7 +340,7 @@ struct paravirt_patch_template pv_ops = {
#endif /* CONFIG_PARAVIRT_XXL */
/* Mmu ops. */
.mmu.flush_tlb_user = native_flush_tlb,
.mmu.flush_tlb_user = native_flush_tlb_local,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_others = native_flush_tlb_others,
......
......@@ -612,6 +612,17 @@ void speculation_ctrl_update_current(void)
preempt_enable();
}
static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
newval = cr4 ^ mask;
if (newval != cr4) {
this_cpu_write(cpu_tlbstate.cr4, newval);
__write_cr4(newval);
}
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
unsigned long tifp, tifn;
......
......@@ -49,7 +49,7 @@
* Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
* (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
*/
uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
[_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
......@@ -57,9 +57,16 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
EXPORT_SYMBOL(__cachemode2pte_tbl);
uint8_t __pte2cachemode_tbl[8] = {
unsigned long cachemode2protval(enum page_cache_mode pcm)
{
if (likely(pcm == 0))
return 0;
return __cachemode2pte_tbl[pcm];
}
EXPORT_SYMBOL(cachemode2protval);
static uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
......@@ -69,7 +76,22 @@ uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
EXPORT_SYMBOL(__pte2cachemode_tbl);
/* Check that the write-protect PAT entry is set for write-protect */
bool x86_has_pat_wp(void)
{
return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP;
}
enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
{
unsigned long masked;
masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
if (likely(masked == 0))
return 0;
return __pte2cachemode_tbl[__pte2cm_idx(masked)];
}
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
......@@ -170,6 +192,19 @@ struct map_range {
static int page_size_mask;
/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
* up after us can get the correct flags. Invoked on the boot CPU.
*/
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
mmu_cr4_features |= mask;
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
cr4_set_bits(mask);
}
static void __init probe_page_size_mask(void)
{
/*
......@@ -955,7 +990,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
......
......@@ -304,7 +304,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one_kernel(vaddr);
flush_tlb_one_kernel(vaddr);
}
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
......@@ -373,7 +373,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
pgprot_t prot;
pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
protval_4k_2_large(cachemode2protval(cache));
BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
......
......@@ -778,10 +778,8 @@ void __init *early_memremap_encrypted(resource_size_t phys_addr,
void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
unsigned long size)
{
/* Be sure the write-protect PAT entry is set for write-protect */
if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
if (!x86_has_pat_wp())
return NULL;
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
}
......@@ -799,10 +797,8 @@ void __init *early_memremap_decrypted(resource_size_t phys_addr,
void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
unsigned long size)
{
/* Be sure the write-protect PAT entry is set for write-protect */
if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
if (!x86_has_pat_wp())
return NULL;
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
#endif /* CONFIG_AMD_MEM_ENCRYPT */
......@@ -889,5 +885,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx,
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
pte_clear(&init_mm, addr, pte);
__flush_tlb_one_kernel(addr);
flush_tlb_one_kernel(addr);
}
......@@ -173,7 +173,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
return -1;
}
__flush_tlb_one_kernel(f->addr);
flush_tlb_one_kernel(f->addr);
return 0;
}
......
......@@ -134,7 +134,7 @@ static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
} while (size);
__native_flush_tlb();
flush_tlb_local();
}
void __init sme_unmap_bootdata(char *real_mode_data)
......
......@@ -69,6 +69,11 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
return __pgprot(cachemode2protval(pcm));
}
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
......@@ -341,7 +346,7 @@ static void __cpa_flush_tlb(void *data)
unsigned int i;
for (i = 0; i < cpa->numpages; i++)
__flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
static void cpa_flush(struct cpa_data *data, int cache)
......
......@@ -19,6 +19,14 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
#ifndef CONFIG_PARAVIRT
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
tlb_remove_page(tlb, table);
}
#endif
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
......@@ -706,11 +714,9 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
if (pud_present(*pud) && !pud_huge(*pud))
return 0;
prot = pgprot_4k_2_large(prot);
set_pte((pte_t *)pud, pfn_pte(
(u64)addr >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
......@@ -738,11 +744,9 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
if (pmd_present(*pmd) && !pmd_huge(*pmd))
return 0;
prot = pgprot_4k_2_large(prot);
set_pte((pte_t *)pmd, pfn_pte(
(u64)addr >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
......
......@@ -64,7 +64,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one_kernel(vaddr);
flush_tlb_one_kernel(vaddr);
}
unsigned long __FIXADDR_TOP = 0xfffff000;
......
......@@ -18,6 +18,16 @@
#include "mm_internal.h"
#ifdef CONFIG_PARAVIRT
# define STATIC_NOPV
#else
# define STATIC_NOPV static
# define __flush_tlb_local native_flush_tlb_local
# define __flush_tlb_global native_flush_tlb_global
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
#endif
/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
......@@ -38,6 +48,126 @@
*/
#define LAST_USER_MM_IBPB 0x1UL
/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
* to what is traditionally called ASID on the RISC processors.
*
* We don't use the traditional ASID implementation, where each process/mm gets
* its own ASID and flush/restart when we run out of ASID space.
*
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
* that came by on this CPU, allowing cheaper switch_mm between processes on
* this CPU.
*
* We end up with different spaces for different things. To avoid confusion we
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
* the canonical identifier for an mm
*
* kPCID - [1, TLB_NR_DYN_ASIDS]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
/*
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
#endif
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
/*
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
* for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
/*
* Given @asid, compute kPCID
*/
static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not confict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
/*
* The ASID being passed in here should have respected the
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
*/
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
#endif
/*
* The dynamically-assigned ASIDs that get passed in are small
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
* so do not bother to clear it.
*
* If PCID is on, ASID-aware code paths put the ASID+1 into the
* PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
* value (with PCID == 0), and then restores CR3, thus corrupting
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
* that any bugs involving loading a PCID-enabled CR3 with
* CR4.PCIDE off will trigger deterministically.
*/
return asid + 1;
}
/*
* Given @asid, compute uPCID
*/
static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
}
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
return __sme_pa(pgd) | kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(pgd);
}
}
static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
}
/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
......@@ -110,6 +240,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
* See SWITCH_TO_USER_CR3.
*/
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
return;
/*
* We only have a single ASID if PCID is off and the CR3
* write will have flushed it.
*/
if (!cpu_feature_enabled(X86_FEATURE_PCID))
return;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
__set_bit(kern_pcid(asid),
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
......@@ -244,6 +400,26 @@ static void cond_ibpb(struct task_struct *next)
}
}
#ifdef CONFIG_PERF_EVENTS
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
(!static_branch_unlikely(&rdpmc_never_available_key) &&
atomic_read(&mm->context.perf_rdpmc_allowed)))
cr4_set_bits_irqsoff(X86_CR4_PCE);
else
cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
void cr4_update_pce(void *ignored)
{
cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
}
#else
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
......@@ -403,7 +579,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
load_mm_cr4_irqsoff(next);
cr4_update_pce_mm(next);
switch_ldt(real_prev, next);
}
}
......@@ -580,7 +756,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
unsigned long addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
......@@ -588,7 +764,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
trace_tlb_flush(reason, TLB_FLUSH_ALL);
......@@ -623,7 +799,7 @@ static bool tlb_is_not_lazy(int cpu, void *data)
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
......@@ -674,6 +850,12 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1, cpumask);
}
void flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
__flush_tlb_others(cpumask, info);
}
/*
* See Documentation/x86/tlb.rst for details. We choose 33
* because it is large enough to cover the vast majority (at
......@@ -784,7 +966,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_one_kernel(addr);
flush_tlb_one_kernel(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
......@@ -806,6 +988,164 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
}
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
* It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully.
*/
unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
/*
* Flush one page in the kernel mapping
*/
void flush_tlb_one_kernel(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
/*
* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
* paravirt equivalent. Even with PCID, this is sufficient: we only
* use PCID if we also use global PTEs for the kernel mapping, and
* INVLPG flushes global translations across all address spaces.
*
* If PTI is on, then the kernel is mapped with non-global PTEs, and
* __flush_tlb_one_user() will flush the given address for the current
* kernel address space and for its usermode counterpart, but it does
* not flush it for other address spaces.
*/
flush_tlb_one_user(addr);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* See above. We need to propagate the flush to all other address
* spaces. In principle, we only need to propagate it to kernelmode
* address spaces, but the extra bookkeeping we would need is not
* worth it.
*/
this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
/*
* Flush one page in the user mapping
*/
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
{
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* Just use invalidate_user_asid() in case we are called early.
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
invalidate_user_asid(loaded_mm_asid);
else
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
}
void flush_tlb_one_user(unsigned long addr)
{
__flush_tlb_one_user(addr);
}
/*
* Flush everything
*/
STATIC_NOPV void native_flush_tlb_global(void)
{
unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
}
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* toggle PGE */
native_write_cr4(cr4 ^ X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
/*
* Flush the entire current user mapping
*/
STATIC_NOPV void native_flush_tlb_local(void)
{
/*
* Preemption or interrupts must be disabled to protect the access
* to the per CPU variable and to prevent being preempted between
* read_cr3() and write_cr3().
*/
WARN_ON_ONCE(preemptible());
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3());
}
void flush_tlb_local(void)
{
__flush_tlb_local();
}
/*
* Flush everything
*/
void __flush_tlb_all(void)
{
/*
* This is to catch users with enabled preemption and the PGE feature
* and don't trigger the warning in __native_flush_tlb().
*/
VM_WARN_ON_ONCE(preemptible());
if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
*/
flush_tlb_local();
}
}
EXPORT_SYMBOL_GPL(__flush_tlb_all);
/*
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
* This means that the 'struct flush_tlb_info' that describes which mappings to
......@@ -837,6 +1177,38 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
put_cpu();
}
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
* switching the loaded mm. It can also be dangerous if we
* interrupted some kernel code that was temporarily using a
* different mm.
*/
bool nmi_uaccess_okay(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
struct mm_struct *current_mm = current->mm;
VM_WARN_ON_ONCE(!loaded_mm);
/*
* The condition we want to check is
* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
* is supposed to be reasonably fast.
*
* Instead, we check the almost equivalent but somewhat conservative
* condition below, and we rely on the fact that switch_mm_irqs_off()
* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
*/
if (loaded_mm != current_mm)
return false;
VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
return true;
}
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
......
......@@ -293,10 +293,10 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
* This must be a normal message, or retry of a normal message
*/
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
flush_tlb_local();
stat->d_alltlb++;
} else {
__flush_tlb_one_user(msg->address);
flush_tlb_one_user(msg->address);
stat->d_onetlb++;
}
stat->d_requestee++;
......
......@@ -27,7 +27,6 @@
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment