Commit 0bc40e54 authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The changes in here are:

   - text_poke() fixes and an extensive set of executability lockdowns,
     to (hopefully) eliminate the last residual circumstances under
     which we are using W|X mappings even temporarily on x86 kernels.
     This required a broad range of surgery in text patching facilities,
     module loading, trampoline handling and other bits.

   - tweak page fault messages to be more informative and more
     structured.

   - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
     default.

   - reduce KASLR granularity on 5-level paging kernels from 512 GB to
     1 GB.

   - misc other changes and updates"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  x86/mm: Initialize PGD cache during mm initialization
  x86/alternatives: Add comment about module removal races
  x86/kprobes: Use vmalloc special flag
  x86/ftrace: Use vmalloc special flag
  bpf: Use vmalloc special flag
  modules: Use vmalloc special flag
  mm/vmalloc: Add flag for freeing of special permsissions
  mm/hibernation: Make hibernation handle unmapped pages
  x86/mm/cpa: Add set_direct_map_*() functions
  x86/alternatives: Remove the return value of text_poke_*()
  x86/jump-label: Remove support for custom text poker
  x86/modules: Avoid breaking W^X while loading modules
  x86/kprobes: Set instruction page as executable
  x86/ftrace: Set trampoline pages as executable
  x86/kgdb: Avoid redundant comparison of patched code
  x86/alternatives: Use temporary mm for text poking
  x86/alternatives: Initialize temporary mm for patching
  fork: Provide a function for copying init_mm
  uprobes: Initialize uprobes earlier
  x86/mm: Save debug registers when loading a temporary mm
  ...
parents e913c4a4 caa84136
...@@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables ...@@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables
Notes: Notes:
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x, - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
offset and many of the regions expand to support the much larger physical offset and many of the regions expand to support the much larger physical
memory supported. memory supported.
...@@ -83,7 +83,7 @@ Notes: ...@@ -83,7 +83,7 @@ Notes:
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________ __________________|____________|__________________|_________|___________________________________________________________
| | | | | | | |
0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical 0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -64 PB | | | | virtual memory addresses up to the -64 PB
| | | | starting offset of kernel mappings. | | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________ __________________|____________|__________________|_________|___________________________________________________________
...@@ -99,7 +99,7 @@ ____________________________________________________________|___________________ ...@@ -99,7 +99,7 @@ ____________________________________________________________|___________________
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory
__________________|____________|__________________|_________|____________________________________________________________ __________________|____________|__________________|_________|____________________________________________________________
| |
| Identical layout to the 47-bit one from here on: | Identical layout to the 47-bit one from here on:
......
...@@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE ...@@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE
config ARCH_HAS_SET_MEMORY config ARCH_HAS_SET_MEMORY
bool bool
# Select if arch has all set_direct_map_invalid/default() functions
config ARCH_HAS_SET_DIRECT_MAP
bool
# Select if arch init_task must go in the __init_task_data section # Select if arch init_task must go in the __init_task_data section
config ARCH_TASK_STRUCT_ON_STACK config ARCH_TASK_STRUCT_ON_STACK
bool bool
......
...@@ -65,6 +65,7 @@ config X86 ...@@ -65,6 +65,7 @@ config X86
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE
select ARCH_HAS_SET_MEMORY select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
...@@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE ...@@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE
depends on X86_32 && !NUMA depends on X86_32 && !NUMA
config ARCH_DISCONTIGMEM_ENABLE config ARCH_DISCONTIGMEM_ENABLE
def_bool y def_bool n
depends on NUMA && X86_32
config ARCH_DISCONTIGMEM_DEFAULT
def_bool y
depends on NUMA && X86_32 depends on NUMA && X86_32
depends on BROKEN
config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_ENABLE
def_bool y def_bool y
...@@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE ...@@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE
select SPARSEMEM_VMEMMAP_ENABLE if X86_64 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
config ARCH_SPARSEMEM_DEFAULT config ARCH_SPARSEMEM_DEFAULT
def_bool y def_bool X86_64 || (NUMA && X86_32)
depends on X86_64
config ARCH_SELECT_MEMORY_MODEL config ARCH_SELECT_MEMORY_MODEL
def_bool y def_bool y
......
...@@ -103,8 +103,6 @@ enum fixed_addresses { ...@@ -103,8 +103,6 @@ enum fixed_addresses {
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP, FIX_PARAVIRT_BOOTMAP,
#endif #endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
#ifdef CONFIG_X86_INTEL_MID #ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC, FIX_LNW_VRTC,
#endif #endif
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/paravirt.h> #include <asm/paravirt.h>
#include <asm/mpx.h> #include <asm/mpx.h>
#include <asm/debugreg.h>
extern atomic64_t last_mm_ctx_id; extern atomic64_t last_mm_ctx_id;
...@@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void) ...@@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void)
return cr3; return cr3;
} }
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
/*
* Using a temporary mm makes it possible to set temporary mappings that are not
* accessible by other CPUs. Such mappings are needed to perform sensitive memory
* writes that override the kernel memory protections (e.g., W^X), without
* exposing the temporary page-table mappings that are required for these write
* operations to other CPUs. Using a temporary mm also avoids TLB shootdowns when
* the mapping is torn down.
*
* Context: The temporary mm needs to be used exclusively by a single core. To
* harden security, IRQs must be disabled while the temporary mm is
* loaded, thereby preventing interrupt handler bugs from overriding
* the kernel memory protection.
*/
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return temp_state;
}
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
#endif /* _ASM_X86_MMU_CONTEXT_H */ #endif /* _ASM_X86_MMU_CONTEXT_H */
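For reference, the calling pattern these two helpers are designed for (and which
__text_poke() follows later in this diff) looks roughly like this; a minimal
sketch with the PTE setup, error handling and KASAN handling omitted, not a
verbatim excerpt:

	unsigned long flags;
	temp_mm_state_t prev;

	/* map the target page(s) into poking_mm at poking_addr beforehand */
	local_irq_save(flags);			/* hard requirement, see the Context note above */
	prev = use_temporary_mm(poking_mm);	/* the private mm is now loaded */
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	unuse_temporary_mm(prev);		/* back to the previously loaded mm */
	/* clear the PTEs and flush_tlb_mm_range(poking_mm, ...) afterwards */
	local_irq_restore(flags);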
...@@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void) ...@@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
/* Default trampoline pgd value */ /* Default trampoline pgd value */
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
} }
void __init poking_init(void);
# ifdef CONFIG_RANDOMIZE_MEMORY # ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void); void __meminit init_trampoline(void);
# else # else
......
...@@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages); ...@@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages);
int set_pages_ro(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages);
int set_pages_rw(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly; extern int kernel_set_to_readonly;
void set_kernel_text_rw(void); void set_kernel_text_rw(void);
void set_kernel_text_ro(void); void set_kernel_text_ro(void);
......
...@@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, ...@@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL #define __parainstructions_end NULL
#endif #endif
extern void *text_poke_early(void *addr, const void *opcode, size_t len); extern void text_poke_early(void *addr, const void *opcode, size_t len);
/* /*
* Clear and restore the kernel write-protection flag on the local CPU. * Clear and restore the kernel write-protection flag on the local CPU.
...@@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); ...@@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
* inconsistent instruction while you patch. * inconsistent instruction while you patch.
*/ */
extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs); extern int poke_int3_handler(struct pt_regs *regs);
extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
extern int after_bootmem; extern int after_bootmem;
extern __ro_after_init struct mm_struct *poking_mm;
extern __ro_after_init unsigned long poking_addr;
#endif /* _ASM_X86_TEXT_PATCHING_H */ #endif /* _ASM_X86_TEXT_PATCHING_H */
...@@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void) ...@@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void)
return true; return true;
} }
#define nmi_uaccess_okay nmi_uaccess_okay
/* Initialize cr4 shadow for this CPU. */ /* Initialize cr4 shadow for this CPU. */
static inline void cr4_init_shadow(void) static inline void cr4_init_shadow(void)
{ {
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/kdebug.h> #include <linux/kdebug.h>
#include <linux/kprobes.h> #include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <asm/text-patching.h> #include <asm/text-patching.h>
#include <asm/alternative.h> #include <asm/alternative.h>
#include <asm/sections.h> #include <asm/sections.h>
...@@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) ...@@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[]; extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len); void text_poke_early(void *addr, const void *opcode, size_t len);
/* /*
* Are we looking at a near JMP with a 1 or 4-byte displacement. * Are we looking at a near JMP with a 1 or 4-byte displacement.
...@@ -666,16 +667,136 @@ void __init alternative_instructions(void) ...@@ -666,16 +667,136 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected again NMI or MCE * instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch. * handlers seeing an inconsistent instruction while you patch.
*/ */
void *__init_or_module text_poke_early(void *addr, const void *opcode, void __init_or_module text_poke_early(void *addr, const void *opcode,
size_t len) size_t len)
{ {
unsigned long flags; unsigned long flags;
if (boot_cpu_has(X86_FEATURE_NX) &&
is_module_text_address((unsigned long)addr)) {
/*
* Module text is initially marked as non-executable, so the
* code cannot be running and speculative code-fetches are
* prevented. Just change the code.
*/
memcpy(addr, opcode, len);
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
local_irq_restore(flags);
sync_core();
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
* that causes hangs on some VIA CPUs.
*/
}
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static void *__text_poke(void *addr, const void *opcode, size_t len)
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
temp_mm_state_t prev;
unsigned long flags;
pte_t pte, *ptep;
spinlock_t *ptl;
pgprot_t pgprot;
/*
* While the boot memory allocator is running we cannot use struct pages, as
* they are not yet initialized. There is no way to recover.
*/
BUG_ON(!after_bootmem);
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
if (cross_page_boundary)
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
} else {
pages[0] = virt_to_page(addr);
WARN_ON(!PageReserved(pages[0]));
if (cross_page_boundary)
pages[1] = virt_to_page(addr + PAGE_SIZE);
}
/*
* If something went wrong, crash and burn since recovery paths are not
* implemented.
*/
BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
local_irq_save(flags); local_irq_save(flags);
memcpy(addr, opcode, len);
/*
* Map the page without the global bit, as TLB flushing is done with
* flush_tlb_mm_range(), which is intended for non-global PTEs.
*/
pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
/*
* The lock is not really needed, but it allows us to avoid open-coding.
*/
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
/*
* This must not fail; preallocated in poking_init().
*/
VM_BUG_ON(!ptep);
pte = mk_pte(pages[0], pgprot);
set_pte_at(poking_mm, poking_addr, ptep, pte);
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
}
/*
* Loading the temporary mm behaves as a compiler barrier, which
* guarantees that the PTE will be set at the time memcpy() is done.
*/
prev = use_temporary_mm(poking_mm);
kasan_disable_current();
memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
kasan_enable_current();
/*
* Ensure that the PTE is only cleared after the stores of the memcpy()
* above have been issued, by using a compiler barrier.
*/
barrier();
pte_clear(poking_mm, poking_addr, ptep);
if (cross_page_boundary)
pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
/*
* Loading the previous page-table hierarchy requires a serializing
* instruction that already allows the core to see the updated version.
* Xen-PV is assumed to serialize execution in a similar manner.
*/
unuse_temporary_mm(prev);
/*
* Flushing the TLB might involve IPIs, which would require enabled IRQs,
* but that is not needed here as the mm is not used by any CPU at this point.
*/
flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
/*
* If the text does not match what we just wrote then something is
* fundamentally screwy; there's nothing we can really do about that.
*/
BUG_ON(memcmp(addr, opcode, len));
pte_unmap_unlock(ptep, ptl);
local_irq_restore(flags); local_irq_restore(flags);
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
return addr; return addr;
} }
...@@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, ...@@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
* It means the size must be writable atomically and the address must be aligned * It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single * in a way that permits an atomic write. It also makes sure we fit on a single
* page. * page.
*
* Note that the caller must ensure that if the modified code is part of a
* module, the module is not removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
* through a mutex.
*/ */
void *text_poke(void *addr, const void *opcode, size_t len) void *text_poke(void *addr, const void *opcode, size_t len)
{ {
unsigned long flags;
char *vaddr;
struct page *pages[2];
int i;
/*
* While boot memory allocator is runnig we cannot use struct
* pages as they are not yet initialized.
*/
BUG_ON(!after_bootmem);
lockdep_assert_held(&text_mutex); lockdep_assert_held(&text_mutex);
if (!core_kernel_text((unsigned long)addr)) { return __text_poke(addr, opcode, len);
pages[0] = vmalloc_to_page(addr); }
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
} else { /**
pages[0] = virt_to_page(addr); * text_poke_kgdb - Update instructions on a live kernel by kgdb
WARN_ON(!PageReserved(pages[0])); * @addr: address to modify
pages[1] = virt_to_page(addr + PAGE_SIZE); * @opcode: source of the copy
} * @len: length to copy
BUG_ON(!pages[0]); *
local_irq_save(flags); * Only atomic text poke/set should be allowed when not doing early patching.
set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); * It means the size must be writable atomically and the address must be aligned
if (pages[1]) * in a way that permits an atomic write. It also makes sure we fit on a single
set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); * page.
vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); *
memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); * Context: should only be used by kgdb, which ensures no other core is running,
clear_fixmap(FIX_TEXT_POKE0); * despite the fact it does not hold the text_mutex.
if (pages[1]) */
clear_fixmap(FIX_TEXT_POKE1); void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
local_flush_tlb(); {
sync_core(); return __text_poke(addr, opcode, len);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
for (i = 0; i < len; i++)
BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
local_irq_restore(flags);
return addr;
} }
static void do_sync_core(void *info) static void do_sync_core(void *info)
...@@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); ...@@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* replacing opcode * replacing opcode
* - sync cores * - sync cores
*/ */
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{ {
unsigned char int3 = 0xcc; unsigned char int3 = 0xcc;
...@@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) ...@@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
* the writing of the new instruction. * the writing of the new instruction.
*/ */
bp_patching_in_progress = false; bp_patching_in_progress = false;
return addr;
} }
...@@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size) ...@@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size)
{ {
return module_alloc(size); return module_alloc(size);
} }
static inline void tramp_free(void *tramp, int size) static inline void tramp_free(void *tramp)
{ {
int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
set_memory_nx((unsigned long)tramp, npages);
set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp); module_memfree(tramp);
} }
#else #else
...@@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size) ...@@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size)
{ {
return NULL; return NULL;
} }
static inline void tramp_free(void *tramp, int size) { } static inline void tramp_free(void *tramp) { }
#endif #endif
/* Defined as markers to the end of the ftrace default trampolines */ /* Defined as markers to the end of the ftrace default trampolines */
...@@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ...@@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long end_offset; unsigned long end_offset;
unsigned long op_offset; unsigned long op_offset;
unsigned long offset; unsigned long offset;
unsigned long npages;
unsigned long size; unsigned long size;
unsigned long retq; unsigned long retq;
unsigned long *ptr; unsigned long *ptr;
...@@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ...@@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
return 0; return 0;
*tramp_size = size + RET_SIZE + sizeof(void *); *tramp_size = size + RET_SIZE + sizeof(void *);
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */ /* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size); ret = probe_kernel_read(trampoline, (void *)start_offset, size);
...@@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ...@@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* ALLOC_TRAMP flags lets us know we created it */ /* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
set_vm_flush_reset_perms(trampoline);
/*
* Module allocation needs to be completed by making the page
* executable. The page is still writable, which is a security hazard,
* but ftrace breaks W^X completely anyhow.
*/
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline; return (unsigned long)trampoline;
fail: fail:
tramp_free(trampoline, *tramp_size); tramp_free(trampoline);
return 0; return 0;
} }
...@@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) ...@@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return; return;
tramp_free((void *)ops->trampoline, ops->trampoline_size); tramp_free((void *)ops->trampoline);
ops->trampoline = 0; ops->trampoline = 0;
} }
......
...@@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line) ...@@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
static void __ref __jump_label_transform(struct jump_entry *entry, static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type, enum jump_label_type type,
void *(*poker)(void *, const void *, size_t),
int init) int init)
{ {
union jump_code_union jmp; union jump_code_union jmp;
...@@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry, ...@@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
jmp.offset = jump_entry_target(entry) - jmp.offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
if (early_boot_irqs_disabled)
poker = text_poke_early;
if (type == JUMP_LABEL_JMP) { if (type == JUMP_LABEL_JMP) {
if (init) { if (init) {
expect = default_nop; line = __LINE__; expect = default_nop; line = __LINE__;
...@@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry, ...@@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
bug_at((void *)jump_entry_code(entry), line); bug_at((void *)jump_entry_code(entry), line);
/* /*
* Make text_poke_bp() a default fallback poker. * As long as only a single processor is running and the code is still
* not marked as RO, text_poke_early() can be used; checking that
* system_state is SYSTEM_BOOTING guarantees it. It will be set to
* SYSTEM_SCHEDULING before other cores are woken up and before the
* code is write-protected.
* *
* At the time the change is being done, just ignore whether we * At the time the change is being done, just ignore whether we
* are doing nop -> jump or jump -> nop transition, and assume * are doing nop -> jump or jump -> nop transition, and assume
* always nop being the 'currently valid' instruction * always nop being the 'currently valid' instruction
*
*/ */
if (poker) { if (init || system_state == SYSTEM_BOOTING) {
(*poker)((void *)jump_entry_code(entry), code, text_poke_early((void *)jump_entry_code(entry), code,
JUMP_LABEL_NOP_SIZE); JUMP_LABEL_NOP_SIZE);
return; return;
} }
...@@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry, ...@@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type) enum jump_label_type type)
{ {
mutex_lock(&text_mutex); mutex_lock(&text_mutex);
__jump_label_transform(entry, type, NULL, 0); __jump_label_transform(entry, type, 0);
mutex_unlock(&text_mutex); mutex_unlock(&text_mutex);
} }
...@@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, ...@@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
jlstate = JL_STATE_NO_UPDATE; jlstate = JL_STATE_NO_UPDATE;
} }
if (jlstate == JL_STATE_UPDATE) if (jlstate == JL_STATE_UPDATE)
__jump_label_transform(entry, type, text_poke_early, 1); __jump_label_transform(entry, type, 1);
} }
...@@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) ...@@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{ {
int err; int err;
char opc[BREAK_INSTR_SIZE];
bpt->type = BP_BREAKPOINT; bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
...@@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) ...@@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
if (!err) if (!err)
return err; return err;
/* /*
* It is safe to call text_poke() because normal kernel execution * It is safe to call text_poke_kgdb() because normal kernel execution
* is stopped on all cores, so long as the text_mutex is not locked. * is stopped on all cores, so long as the text_mutex is not locked.
*/ */
if (mutex_is_locked(&text_mutex)) if (mutex_is_locked(&text_mutex))
return -EBUSY; return -EBUSY;
text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
BREAK_INSTR_SIZE); BREAK_INSTR_SIZE);
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
if (err)
return err;
if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
return -EINVAL;
bpt->type = BP_POKE_BREAKPOINT; bpt->type = BP_POKE_BREAKPOINT;
return err; return err;
...@@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) ...@@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{ {
int err;
char opc[BREAK_INSTR_SIZE];
if (bpt->type != BP_POKE_BREAKPOINT) if (bpt->type != BP_POKE_BREAKPOINT)
goto knl_write; goto knl_write;
/* /*
* It is safe to call text_poke() because normal kernel execution * It is safe to call text_poke_kgdb() because normal kernel execution
* is stopped on all cores, so long as the text_mutex is not locked. * is stopped on all cores, so long as the text_mutex is not locked.
*/ */
if (mutex_is_locked(&text_mutex)) if (mutex_is_locked(&text_mutex))
goto knl_write; goto knl_write;
text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); BREAK_INSTR_SIZE);
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) return 0;
goto knl_write;
return err;
knl_write: knl_write:
return probe_kernel_write((char *)bpt->bpt_addr, return probe_kernel_write((char *)bpt->bpt_addr,
......
...@@ -431,8 +431,21 @@ void *alloc_insn_page(void) ...@@ -431,8 +431,21 @@ void *alloc_insn_page(void)
void *page; void *page;
page = module_alloc(PAGE_SIZE); page = module_alloc(PAGE_SIZE);
if (page) if (!page)
set_memory_ro((unsigned long)page & PAGE_MASK, 1); return NULL;
set_vm_flush_reset_perms(page);
/*
* First make the page read-only, and only then make it executable to
* prevent it from being W+X in between.
*/
set_memory_ro((unsigned long)page, 1);
/*
* TODO: Once additional kernel code protection mechanisms are in place,
* ensure that the page was not maliciously altered and is still zeroed.
*/
set_memory_x((unsigned long)page, 1);
return page; return page;
} }
...@@ -440,8 +453,6 @@ void *alloc_insn_page(void) ...@@ -440,8 +453,6 @@ void *alloc_insn_page(void)
/* Recover page to RW mode before releasing it */ /* Recover page to RW mode before releasing it */
void free_insn_page(void *page) void free_insn_page(void *page)
{ {
set_memory_nx((unsigned long)page & PAGE_MASK, 1);
set_memory_rw((unsigned long)page & PAGE_MASK, 1);
module_memfree(page); module_memfree(page);
} }
......
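Together with the text_poke() rework earlier in this series, the life cycle of
such an executable page now looks roughly as follows; a hedged sketch of the
intended ordering, not code taken from this diff:

	void *page = module_alloc(PAGE_SIZE);

	set_vm_flush_reset_perms(page);		/* vfree() will reset the direct map and flush */
	set_memory_ro((unsigned long)page, 1);	/* drop write first ... */
	set_memory_x((unsigned long)page, 1);	/* ... then add execute, never W+X in between */
	/* later writes go through text_poke(), i.e. the temporary mm */
	module_memfree(page);			/* no manual set_memory_nx()/set_memory_rw() needed */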
...@@ -87,7 +87,7 @@ void *module_alloc(unsigned long size) ...@@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN, p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(), MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL, MODULES_END, GFP_KERNEL,
PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0)); __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) { if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p); vfree(p);
......
...@@ -141,11 +141,11 @@ SECTIONS ...@@ -141,11 +141,11 @@ SECTIONS
*(.text.__x86.indirect_thunk) *(.text.__x86.indirect_thunk)
__indirect_thunk_end = .; __indirect_thunk_end = .;
#endif #endif
/* End of text section */
_etext = .;
} :text = 0x9090 } :text = 0x9090
/* End of text section */
_etext = .;
NOTES :text :note NOTES :text :note
EXCEPTION_TABLE(16) :text = 0x9090 EXCEPTION_TABLE(16) :text = 0x9090
......
...@@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address) ...@@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END)) if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1; return -1;
WARN_ON_ONCE(in_nmi());
/* /*
* Copy kernel mappings over when needed. This can also * Copy kernel mappings over when needed. This can also
* happen within a race in page table update. In the later * happen within a race in page table update. In the later
...@@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) ...@@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
name, index, addr, (desc.limit0 | (desc.limit1 << 16))); name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
} }
/*
* This helper function transforms the #PF error_code bits into
* "[PROT] [USER]" type of descriptive, almost human-readable error strings:
*/
static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
{
if (error_code & mask) {
if (buf[0])
strcat(buf, " ");
strcat(buf, txt);
}
}
static void static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{ {
char err_txt[64];
if (!oops_may_print()) if (!oops_may_print())
return; return;
...@@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad ...@@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
from_kuid(&init_user_ns, current_uid())); from_kuid(&init_user_ns, current_uid()));
} }
pr_alert("BUG: unable to handle kernel %s at %px\n", if (address < PAGE_SIZE && !user_mode(regs))
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
(void *)address); (void *)address);
else
err_txt[0] = 0; pr_alert("BUG: unable to handle page fault for address: %px\n",
(void *)address);
/*
* Note: length of these appended strings including the separation space and the pr_alert("#PF: %s %s in %s mode\n",
* zero delimiter must fit into err_txt[]. (error_code & X86_PF_USER) ? "user" : "supervisor",
*/ (error_code & X86_PF_INSTR) ? "instruction fetch" :
err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); (error_code & X86_PF_WRITE) ? "write access" :
err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); "read access",
err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); user_mode(regs) ? "user" : "kernel");
err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); !(error_code & X86_PF_PROT) ? "not-present page" :
err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); (error_code & X86_PF_RSVD) ? "reserved bit violation" :
(error_code & X86_PF_PK) ? "protection keys violation" :
pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); "permissions violation");
if (!(error_code & X86_PF_USER) && user_mode(regs)) { if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt; struct desc_ptr idt, gdt;
u16 ldtr, tr; u16 ldtr, tr;
pr_alert("This was a system access from user code\n");
/* /*
* This can happen for quite a few reasons. The more obvious * This can happen for quite a few reasons. The more obvious
* ones are faults accessing the GDT, or LDT. Perhaps * ones are faults accessing the GDT, or LDT. Perhaps
......
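To make the new, more structured output concrete: a kernel-mode write to an
unmapped (non-NULL) address, which previously printed "BUG: unable to handle
kernel paging request at ..." followed by "#PF error: [WRITE]", is now reported
along these lines (the address and error code below are made up for
illustration):

	BUG: unable to handle page fault for address: ffffffffc0281000
	#PF: supervisor write access in kernel mode
	#PF: error_code(0x0002) - not-present page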
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <linux/swapfile.h> #include <linux/swapfile.h>
#include <linux/swapops.h> #include <linux/swapops.h>
#include <linux/kmemleak.h> #include <linux/kmemleak.h>
#include <linux/sched/task.h>
#include <asm/set_memory.h> #include <asm/set_memory.h>
#include <asm/e820/api.h> #include <asm/e820/api.h>
...@@ -23,6 +24,7 @@ ...@@ -23,6 +24,7 @@
#include <asm/hypervisor.h> #include <asm/hypervisor.h>
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/pti.h> #include <asm/pti.h>
#include <asm/text-patching.h>
/* /*
* We need to define the tracepoints somewhere, and tlb.c * We need to define the tracepoints somewhere, and tlb.c
...@@ -701,6 +703,41 @@ void __init init_mem_mapping(void) ...@@ -701,6 +703,41 @@ void __init init_mem_mapping(void)
early_memtest(0, max_pfn_mapped << PAGE_SHIFT); early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
} }
/*
* Initialize an mm_struct to be used during poking and a pointer to be used
* during patching.
*/
void __init poking_init(void)
{
spinlock_t *ptl;
pte_t *ptep;
poking_mm = copy_init_mm();
BUG_ON(!poking_mm);
/*
* Randomize the poking address, but make sure that the following page
* will be mapped in the same PMD. We need 2 pages, so find space for 3,
* and adjust the address if the PMD ends after the first one.
*/
poking_addr = TASK_UNMAPPED_BASE;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
poking_addr += PAGE_SIZE;
/*
* We need to trigger the allocation of the page tables that will be
* used for poking now, because poking may later be performed in an
* atomic section, in which allocation could fail.
*/
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
BUG_ON(!ptep);
pte_unmap_unlock(ptep, ptl);
}
/* /*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address * devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number. * is valid. The argument is a physical page number.
......
...@@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void) ...@@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void)
*/ */
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand)); prandom_bytes_state(&rand_state, &rand, sizeof(rand));
if (pgtable_l5_enabled()) entropy = (rand % (entropy + 1)) & PUD_MASK;
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy; vaddr += entropy;
*kaslr_regions[i].base = vaddr; *kaslr_regions[i].base = vaddr;
...@@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void) ...@@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void)
* randomization alignment. * randomization alignment.
*/ */
vaddr += get_padding(&kaslr_regions[i]); vaddr += get_padding(&kaslr_regions[i]);
if (pgtable_l5_enabled()) vaddr = round_up(vaddr + 1, PUD_SIZE);
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy; remain_entropy -= entropy;
} }
} }
static void __meminit init_trampoline_pud(void) static void __meminit init_trampoline_pud(void)
{ {
unsigned long paddr, paddr_next; pud_t *pud_page_tramp, *pud, *pud_tramp;
p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
unsigned long paddr, vaddr;
pgd_t *pgd; pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp;
int i;
pud_page_tramp = alloc_low_page(); pud_page_tramp = alloc_low_page();
/*
* There are two mappings for the low 1MB area, the direct mapping
* and the 1:1 mapping for the real mode trampoline:
*
* Direct mapping: virt_addr = phys_addr + PAGE_OFFSET
* 1:1 mapping: virt_addr = phys_addr
*/
paddr = 0; paddr = 0;
pgd = pgd_offset_k((unsigned long)__va(paddr)); vaddr = (unsigned long)__va(paddr);
pud_page = (pud_t *) pgd_page_vaddr(*pgd); pgd = pgd_offset_k(vaddr);
for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
pud_t *pud, *pud_tramp;
unsigned long vaddr = (unsigned long)__va(paddr);
pud_tramp = pud_page_tramp + pud_index(paddr); p4d = p4d_offset(pgd, vaddr);
pud = pud_page + pud_index(vaddr); pud = pud_offset(p4d, vaddr);
paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
*pud_tramp = *pud;
}
set_pgd(&trampoline_pgd_entry, pud_tramp = pud_page_tramp + pud_index(paddr);
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); *pud_tramp = *pud;
}
static void __meminit init_trampoline_p4d(void)
{
unsigned long paddr, paddr_next;
pgd_t *pgd;
p4d_t *p4d_page, *p4d_page_tramp;
int i;
p4d_page_tramp = alloc_low_page(); if (pgtable_l5_enabled()) {
p4d_page_tramp = alloc_low_page();
paddr = 0;
pgd = pgd_offset_k((unsigned long)__va(paddr));
p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
p4d_t *p4d, *p4d_tramp;
unsigned long vaddr = (unsigned long)__va(paddr);
p4d_tramp = p4d_page_tramp + p4d_index(paddr); p4d_tramp = p4d_page_tramp + p4d_index(paddr);
p4d = p4d_page + p4d_index(vaddr);
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
*p4d_tramp = *p4d; set_p4d(p4d_tramp,
} __p4d(_KERNPG_TABLE | __pa(pud_page_tramp)));
set_pgd(&trampoline_pgd_entry, set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
} else {
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
} }
/* /*
* Create PGD aligned trampoline table to allow real mode initialization * The real mode trampoline, which is required for bootstrapping CPUs
* of additional CPUs. Consume only 1 low memory page. * occupies only a small area under the low 1MB. See reserve_real_mode()
* for details.
*
* If KASLR is disabled the first PGD entry of the direct mapping is copied
* to map the real mode trampoline.
*
* If KASLR is enabled, copy only the PUD which covers the low 1MB
* area. This limits the randomization granularity to 1GB for both 4-level
* and 5-level paging.
*/ */
void __meminit init_trampoline(void) void __meminit init_trampoline(void)
{ {
if (!kaslr_memory_enabled()) { if (!kaslr_memory_enabled()) {
init_trampoline_default(); init_trampoline_default();
return; return;
} }
if (pgtable_l5_enabled()) init_trampoline_pud();
init_trampoline_p4d();
else
init_trampoline_pud();
} }
...@@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages) ...@@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages)
return set_memory_rw(addr, numpages); return set_memory_rw(addr, numpages);
} }
#ifdef CONFIG_DEBUG_PAGEALLOC
static int __set_pages_p(struct page *page, int numpages) static int __set_pages_p(struct page *page, int numpages)
{ {
unsigned long tempaddr = (unsigned long) page_address(page); unsigned long tempaddr = (unsigned long) page_address(page);
...@@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages) ...@@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages)
return __change_page_attr_set_clr(&cpa, 0); return __change_page_attr_set_clr(&cpa, 0);
} }
int set_direct_map_invalid_noflush(struct page *page)
{
return __set_pages_np(page, 1);
}
int set_direct_map_default_noflush(struct page *page)
{
return __set_pages_p(page, 1);
}
void __kernel_map_pages(struct page *page, int numpages, int enable) void __kernel_map_pages(struct page *page, int numpages, int enable)
{ {
if (PageHighMem(page)) if (PageHighMem(page))
...@@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) ...@@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
} }
#ifdef CONFIG_HIBERNATION #ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page) bool kernel_page_present(struct page *page)
{ {
unsigned int level; unsigned int level;
...@@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page) ...@@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page)
pte = lookup_address((unsigned long)page_address(page), &level); pte = lookup_address((unsigned long)page_address(page), &level);
return (pte_val(*pte) & _PAGE_PRESENT); return (pte_val(*pte) & _PAGE_PRESENT);
} }
#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_HIBERNATION */
#endif /* CONFIG_DEBUG_PAGEALLOC */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags) unsigned numpages, unsigned long page_flags)
{ {
......
...@@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, ...@@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
static struct kmem_cache *pgd_cache; static struct kmem_cache *pgd_cache;
static int __init pgd_cache_init(void) void __init pgd_cache_init(void)
{ {
/* /*
* When PAE kernel is running as a Xen domain, it does not use * When PAE kernel is running as a Xen domain, it does not use
* shared kernel pmd. And this requires a whole page for pgd. * shared kernel pmd. And this requires a whole page for pgd.
*/ */
if (!SHARED_KERNEL_PMD) if (!SHARED_KERNEL_PMD)
return 0; return;
/* /*
* when PAE kernel is not running as a Xen domain, it uses * when PAE kernel is not running as a Xen domain, it uses
...@@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) ...@@ -390,9 +390,7 @@ static int __init pgd_cache_init(void)
*/ */
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL); SLAB_PANIC, NULL);
return 0;
} }
core_initcall(pgd_cache_init);
static inline pgd_t *_pgd_alloc(void) static inline pgd_t *_pgd_alloc(void)
{ {
...@@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) ...@@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd)
} }
#else #else
void __init pgd_cache_init(void)
{
}
static inline pgd_t *_pgd_alloc(void) static inline pgd_t *_pgd_alloc(void)
{ {
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
......
...@@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, ...@@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
} }
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
{ {
const struct flush_tlb_info *f = info; const struct flush_tlb_info *f = info;
...@@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask, ...@@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
*/ */
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif
static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
u64 new_tlb_gen)
{
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
#ifdef CONFIG_DEBUG_VM
/*
* Ensure that the following code is non-reentrant and flush_tlb_info
* is not overwritten. This means no TLB flushing is initiated by
* interrupt handlers and machine-check exception handlers.
*/
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
info->start = start;
info->end = end;
info->mm = mm;
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
return info;
}
static inline void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
/* Complete reentrancy prevention checks */
barrier();
this_cpu_dec(flush_tlb_info_idx);
#endif
}
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift, unsigned long end, unsigned int stride_shift,
bool freed_tables) bool freed_tables)
{ {
struct flush_tlb_info *info;
u64 new_tlb_gen;
int cpu; int cpu;
struct flush_tlb_info info = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu(); cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */ /* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) && if ((end == TLB_FLUSH_ALL) ||
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
info.start = start; start = 0;
info.end = end; end = TLB_FLUSH_ALL;
} else {
info.start = 0UL;
info.end = TLB_FLUSH_ALL;
} }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
VM_WARN_ON(irqs_disabled()); lockdep_assert_irqs_enabled();
local_irq_disable(); local_irq_disable();
flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable(); local_irq_enable();
} }
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), &info); flush_tlb_others(mm_cpumask(mm), info);
put_flush_tlb_info();
put_cpu(); put_cpu();
} }
...@@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info) ...@@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end) void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{ {
/* Balance as user space task's flush, a bit conservative */ /* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL || if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1); on_each_cpu(do_flush_tlb_all, NULL, 1);
} else { } else {
struct flush_tlb_info info; struct flush_tlb_info *info;
info.start = start;
info.end = end; preempt_disable();
on_each_cpu(do_kernel_range_flush, &info, 1); info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
on_each_cpu(do_kernel_range_flush, info, 1);
put_flush_tlb_info();
preempt_enable();
} }
} }
/*
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
* This means that the 'struct flush_tlb_info' that describes which mappings to
* flush is actually fixed. We therefore set a single fixed struct and use it in
* arch_tlbbatch_flush().
*/
static const struct flush_tlb_info full_flush_tlb_info = {
.mm = NULL,
.start = 0,
.end = TLB_FLUSH_ALL,
};
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{ {
struct flush_tlb_info info = {
.mm = NULL,
.start = 0UL,
.end = TLB_FLUSH_ALL,
};
int cpu = get_cpu(); int cpu = get_cpu();
if (cpumask_test_cpu(cpu, &batch->cpumask)) { if (cpumask_test_cpu(cpu, &batch->cpumask)) {
VM_WARN_ON(irqs_disabled()); lockdep_assert_irqs_enabled();
local_irq_disable(); local_irq_disable();
flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
local_irq_enable(); local_irq_enable();
} }
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, &info); flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
cpumask_clear(&batch->cpumask); cpumask_clear(&batch->cpumask);
......
...@@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) ...@@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#elif defined(CONFIG_X86_VSYSCALL_EMULATION) #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
case VSYSCALL_PAGE: case VSYSCALL_PAGE:
#endif #endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
/* All local page mappings */ /* All local page mappings */
pte = pfn_pte(phys, prot); pte = pfn_pte(phys, prot);
break; break;
......
...@@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, ...@@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
static inline void init_espfix_bsp(void) { } static inline void init_espfix_bsp(void) { }
#endif #endif
extern void __init pgd_cache_init(void);
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{ {
......
...@@ -21,6 +21,15 @@ ...@@ -21,6 +21,15 @@
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/cacheflush.h> #include <asm/cacheflush.h>
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or switching
* the loaded mm.
*/
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
/* /*
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/set_memory.h> #include <linux/set_memory.h>
#include <linux/kallsyms.h> #include <linux/kallsyms.h>
#include <linux/if_vlan.h> #include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <net/sch_generic.h> #include <net/sch_generic.h>
...@@ -503,7 +504,6 @@ struct bpf_prog { ...@@ -503,7 +504,6 @@ struct bpf_prog {
u16 pages; /* Number of allocated pages */ u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */ u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */ jit_requested:1,/* archs need to JIT the prog */
undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
gpl_compatible:1, /* Is filter GPL compatible? */ gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */ cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */ dst_needed:1, /* Do we need dst entry? */
...@@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) ...@@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{ {
fp->undo_set_mem = 1; set_vm_flush_reset_perms(fp);
set_memory_ro((unsigned long)fp, fp->pages); set_memory_ro((unsigned long)fp, fp->pages);
} }
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
{
if (fp->undo_set_mem)
set_memory_rw((unsigned long)fp, fp->pages);
}
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{ {
set_vm_flush_reset_perms(hdr);
set_memory_ro((unsigned long)hdr, hdr->pages); set_memory_ro((unsigned long)hdr, hdr->pages);
} set_memory_x((unsigned long)hdr, hdr->pages);
static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
{
set_memory_rw((unsigned long)hdr, hdr->pages);
} }
static inline struct bpf_binary_header * static inline struct bpf_binary_header *
...@@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp); ...@@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp);
static inline void bpf_prog_unlock_free(struct bpf_prog *fp) static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{ {
bpf_prog_unlock_ro(fp);
__bpf_prog_free(fp); __bpf_prog_free(fp);
} }
......
...@@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages, ...@@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
int enable) { } int enable) { }
#endif #endif
#ifdef CONFIG_DEBUG_PAGEALLOC
extern bool _debug_pagealloc_enabled; extern bool _debug_pagealloc_enabled;
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
static inline bool debug_pagealloc_enabled(void) static inline bool debug_pagealloc_enabled(void)
{ {
return _debug_pagealloc_enabled; return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled;
} }
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
static inline void static inline void
kernel_map_pages(struct page *page, int numpages, int enable) kernel_map_pages(struct page *page, int numpages, int enable)
{ {
if (!debug_pagealloc_enabled())
return;
__kernel_map_pages(page, numpages, enable); __kernel_map_pages(page, numpages, enable);
} }
#ifdef CONFIG_HIBERNATION #ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page); extern bool kernel_page_present(struct page *page);
#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_HIBERNATION */
#else /* CONFIG_DEBUG_PAGEALLOC */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
static inline void static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {} kernel_map_pages(struct page *page, int numpages, int enable) {}
#ifdef CONFIG_HIBERNATION #ifdef CONFIG_HIBERNATION
static inline bool kernel_page_present(struct page *page) { return true; } static inline bool kernel_page_present(struct page *page) { return true; }
#endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_HIBERNATION */
static inline bool debug_pagealloc_enabled(void) #endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
{
return false;
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA #ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
......
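With the enabled-check moved out of kernel_map_pages(), callers are expected to guard the call themselves, exactly as the mm/page_alloc.c hunks below do; in minimal form:

    /* Fragment of a free path: only touch the mapping when the debug knob is on. */
    if (debug_pagealloc_enabled())
            kernel_map_pages(page, 1 << order, 0);   /* 0 = unmap the pages */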
...@@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *); ...@@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *);
extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int); struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
......
...@@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } ...@@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif #endif
#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP
static inline int set_direct_map_invalid_noflush(struct page *page)
{
return 0;
}
static inline int set_direct_map_default_noflush(struct page *page)
{
return 0;
}
#endif
#ifndef set_mce_nospec #ifndef set_mce_nospec
static inline int set_mce_nospec(unsigned long pfn) static inline int set_mce_nospec(unsigned long pfn)
{ {
......
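These stubs return 0 so generic code can call them unconditionally. On architectures that implement them, the intended per-page sequence (the one vm_remove_mappings() in the mm/vmalloc.c hunk below follows, routing the flush through _vm_unmap_aliases()) is roughly:

    /* Sketch: retire an executable alias without leaving a stale-TLB or W+X window. */
    set_direct_map_invalid_noflush(page);    /* direct-map entry -> not present */
    flush_tlb_kernel_range(start, end);      /* drop stale translations of alias and direct map */
    set_direct_map_default_noflush(page);    /* restore the default RW, non-executable mapping */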
...@@ -115,6 +115,7 @@ struct uprobes_state { ...@@ -115,6 +115,7 @@ struct uprobes_state {
struct xol_area *xol_area; struct xol_area *xol_area;
}; };
extern void __init uprobes_init(void);
extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_swbp_insn(uprobe_opcode_t *insn);
...@@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, ...@@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
struct uprobes_state { struct uprobes_state {
}; };
static inline void uprobes_init(void)
{
}
#define uprobe_get_trap_addr(regs) instruction_pointer(regs) #define uprobe_get_trap_addr(regs) instruction_pointer(regs)
static inline int static inline int
......
...@@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */ ...@@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
/*
* Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
* vfree_atomic().
*/
#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */
/* bits [20..32] reserved for arch specific ioremap internals */ /* bits [20..32] reserved for arch specific ioremap internals */
/* /*
...@@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size, ...@@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages); pgprot_t prot, struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
extern void unmap_kernel_range(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size);
static inline void set_vm_flush_reset_perms(void *addr)
{
struct vm_struct *vm = find_vm_area(addr);
if (vm)
vm->flags |= VM_FLUSH_RESET_PERMS;
}
#else #else
static inline int static inline int
map_kernel_range_noflush(unsigned long start, unsigned long size, map_kernel_range_noflush(unsigned long start, unsigned long size,
...@@ -157,6 +169,9 @@ static inline void ...@@ -157,6 +169,9 @@ static inline void
unmap_kernel_range(unsigned long addr, unsigned long size) unmap_kernel_range(unsigned long addr, unsigned long size)
{ {
} }
static inline void set_vm_flush_reset_perms(void *addr)
{
}
#endif #endif
/* Allocate/destroy a 'vmalloc' VM area. */ /* Allocate/destroy a 'vmalloc' VM area. */
......
...@@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void) ...@@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void)
void __init __weak mem_encrypt_init(void) { } void __init __weak mem_encrypt_init(void) { }
void __init __weak poking_init(void) { }
void __init __weak pgd_cache_init(void) { }
bool initcall_debug; bool initcall_debug;
core_param(initcall_debug, initcall_debug, bool, 0644); core_param(initcall_debug, initcall_debug, bool, 0644);
...@@ -535,6 +539,7 @@ static void __init mm_init(void) ...@@ -535,6 +539,7 @@ static void __init mm_init(void)
init_espfix_bsp(); init_espfix_bsp();
/* Should be run after espfix64 is set up. */ /* Should be run after espfix64 is set up. */
pti_init(); pti_init();
pgd_cache_init();
} }
void __init __weak arch_call_rest_init(void) void __init __weak arch_call_rest_init(void)
...@@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void) ...@@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void)
taskstats_init_early(); taskstats_init_early();
delayacct_init(); delayacct_init();
poking_init();
check_bugs(); check_bugs();
acpi_subsystem_init(); acpi_subsystem_init();
......
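poking_init() and pgd_cache_init() are weak no-ops so architectures can hook them; in this series x86 uses poking_init() to build its temporary text-poking mm from copy_init_mm(). A loose sketch of that shape (not the verbatim arch code; poking_mm and poking_addr stand for the arch's private state):

    static struct mm_struct *poking_mm;
    static unsigned long poking_addr;

    void __init poking_init(void)
    {
            poking_mm = copy_init_mm();      /* kernel mappings only, no user VMAs */
            BUG_ON(!poking_mm);

            /* Pick an otherwise-unused user address in this mm; text_poke() will
             * temporarily map the page(s) being patched there. */
            poking_addr = TASK_UNMAPPED_BASE;
    }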
...@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) ...@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) { if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_unlock_ro(hdr);
bpf_jit_binary_free(hdr); bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
......
...@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = { ...@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
.priority = INT_MAX-1, /* notified after kprobes, kgdb */ .priority = INT_MAX-1, /* notified after kprobes, kgdb */
}; };
static int __init init_uprobes(void) void __init uprobes_init(void)
{ {
int i; int i;
for (i = 0; i < UPROBES_HASH_SZ; i++) for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]);
if (percpu_init_rwsem(&dup_mmap_sem)) BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
return -ENOMEM;
return register_die_notifier(&uprobe_exception_nb); BUG_ON(register_die_notifier(&uprobe_exception_nb));
} }
__initcall(init_uprobes);
...@@ -815,6 +815,7 @@ void __init fork_init(void) ...@@ -815,6 +815,7 @@ void __init fork_init(void)
#endif #endif
lockdep_init_task(&init_task); lockdep_init_task(&init_task);
uprobes_init();
} }
int __weak arch_dup_task_struct(struct task_struct *dst, int __weak arch_dup_task_struct(struct task_struct *dst,
...@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) ...@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk); complete_vfork_done(tsk);
} }
/* /**
* Allocate a new mm structure and copy contents from the * dup_mm() - duplicates an existing mm structure
* mm structure of the passed in task structure. * @tsk: the task_struct with which the new mm will be associated.
* @oldmm: the mm to duplicate.
*
* Allocates a new mm structure and duplicates the provided @oldmm structure
* content into it.
*
* Return: the duplicated mm or NULL on failure.
*/ */
static struct mm_struct *dup_mm(struct task_struct *tsk) static struct mm_struct *dup_mm(struct task_struct *tsk,
struct mm_struct *oldmm)
{ {
struct mm_struct *mm, *oldmm = current->mm; struct mm_struct *mm;
int err; int err;
mm = allocate_mm(); mm = allocate_mm();
...@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) ...@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
} }
retval = -ENOMEM; retval = -ENOMEM;
mm = dup_mm(tsk); mm = dup_mm(tsk, current->mm);
if (!mm) if (!mm)
goto fail_nomem; goto fail_nomem;
...@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu) ...@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
return task; return task;
} }
struct mm_struct *copy_init_mm(void)
{
return dup_mm(NULL, &init_mm);
}
/* /*
* Ok, this is the main fork-routine. * Ok, this is the main fork-routine.
* *
......
...@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); ...@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex); EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules); static LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
static struct work_struct init_free_wq;
static struct llist_head init_free_list;
#ifdef CONFIG_MODULES_TREE_LOOKUP #ifdef CONFIG_MODULES_TREE_LOOKUP
/* /*
...@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init) ...@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
if (!rodata_enabled) if (!rodata_enabled)
return; return;
set_vm_flush_reset_perms(mod->core_layout.base);
set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro); frob_text(&mod->core_layout, set_memory_ro);
frob_text(&mod->core_layout, set_memory_x);
frob_rodata(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init) if (after_init)
...@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) ...@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx);
} }
static void module_disable_nx(const struct module *mod)
{
frob_rodata(&mod->core_layout, set_memory_x);
frob_ro_after_init(&mod->core_layout, set_memory_x);
frob_writable_data(&mod->core_layout, set_memory_x);
frob_rodata(&mod->init_layout, set_memory_x);
frob_writable_data(&mod->init_layout, set_memory_x);
}
/* Iterate through all modules and set each module's text as RW */ /* Iterate through all modules and set each module's text as RW */
void set_all_modules_text_rw(void) void set_all_modules_text_rw(void)
{ {
...@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void) ...@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
} }
mutex_unlock(&module_mutex); mutex_unlock(&module_mutex);
} }
static void disable_ro_nx(const struct module_layout *layout)
{
if (rodata_enabled) {
frob_text(layout, set_memory_rw);
frob_rodata(layout, set_memory_rw);
frob_ro_after_init(layout, set_memory_rw);
}
frob_rodata(layout, set_memory_x);
frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
#else #else
static void disable_ro_nx(const struct module_layout *layout) { }
static void module_enable_nx(const struct module *mod) { } static void module_enable_nx(const struct module *mod) { }
static void module_disable_nx(const struct module *mod) { }
#endif #endif
#ifdef CONFIG_LIVEPATCH #ifdef CONFIG_LIVEPATCH
...@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod) ...@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
void __weak module_memfree(void *module_region) void __weak module_memfree(void *module_region)
{ {
/*
* This memory may be RO, and freeing RO memory in an interrupt is not
* supported by vmalloc.
*/
WARN_ON(in_interrupt());
vfree(module_region); vfree(module_region);
} }
...@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod) ...@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
mutex_unlock(&module_mutex); mutex_unlock(&module_mutex);
/* This may be empty, but that's OK */ /* This may be empty, but that's OK */
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod); module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base); module_memfree(mod->init_layout.base);
kfree(mod->args); kfree(mod->args);
...@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod) ...@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */ /* Finally, free the core (containing the module structure) */
disable_ro_nx(&mod->core_layout);
module_memfree(mod->core_layout.base); module_memfree(mod->core_layout.base);
} }
...@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) ...@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */ /* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree { struct mod_initfree {
struct rcu_head rcu; struct llist_node node;
void *module_init; void *module_init;
}; };
static void do_free_init(struct rcu_head *head) static void do_free_init(struct work_struct *w)
{ {
struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); struct llist_node *pos, *n, *list;
module_memfree(m->module_init); struct mod_initfree *initfree;
kfree(m);
list = llist_del_all(&init_free_list);
synchronize_rcu();
llist_for_each_safe(pos, n, list) {
initfree = container_of(pos, struct mod_initfree, node);
module_memfree(initfree->module_init);
kfree(initfree);
}
} }
static int __init modules_wq_init(void)
{
INIT_WORK(&init_free_wq, do_free_init);
init_llist_head(&init_free_list);
return 0;
}
module_init(modules_wq_init);
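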
/* /*
* This is where the real work happens. * This is where the real work happens.
* *
...@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) ...@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
#endif #endif
module_enable_ro(mod, true); module_enable_ro(mod, true);
mod_tree_remove_init(mod); mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod); module_arch_freeing_init(mod);
mod->init_layout.base = NULL; mod->init_layout.base = NULL;
mod->init_layout.size = 0; mod->init_layout.size = 0;
...@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) ...@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
* We want to free module_init, but be aware that kallsyms may be * We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we * walking this with preempt disabled. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success * call synchronize_rcu(), but we don't want to slow down the success
* path, so use actual RCU here. * path. module_memfree() cannot be called in an interrupt, so do the
* work and call synchronize_rcu() in a work queue.
*
* Note that module_alloc() on most architectures creates W+X page * Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any * mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to * code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - ie * be cleaned up needs to sync with the queued work - ie
* rcu_barrier() * rcu_barrier()
*/ */
call_rcu(&freeinit->rcu, do_free_init); if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);
mutex_unlock(&module_mutex); mutex_unlock(&module_mutex);
wake_up_all(&module_wq); wake_up_all(&module_wq);
...@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, ...@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod); module_bug_cleanup(mod);
mutex_unlock(&module_mutex); mutex_unlock(&module_mutex);
/* we can't deallocate the module until we clear memory protection */
module_disable_ro(mod);
module_disable_nx(mod);
ddebug_cleanup: ddebug_cleanup:
ftrace_release_mod(mod); ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug); dynamic_debug_remove(mod, info->debug);
......
...@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) ...@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
* safe_copy_page - Copy a page in a safe way. * safe_copy_page - Copy a page in a safe way.
* *
* Check if the page we are going to copy is marked as present in the kernel * Check if the page we are going to copy is marked as present in the kernel
* page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
* and in that case kernel_page_present() always returns 'true'). * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
* always returns 'true'.
*/ */
static void safe_copy_page(void *dst, struct page *s_page) static void safe_copy_page(void *dst, struct page *s_page)
{ {
......
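For reference, a function with this contract can handle a page that DEBUG_PAGEALLOC or the direct-map API has unmapped by mapping it back briefly; a sketch along those lines (it may differ in detail from the actual snapshot.c body):

    static void safe_copy_page_sketch(void *dst, struct page *s_page)
    {
            if (kernel_page_present(s_page)) {
                    do_copy_page(dst, page_address(s_page));
            } else {
                    kernel_map_pages(s_page, 1, 1);          /* map the page back */
                    do_copy_page(dst, page_address(s_page));
                    kernel_map_pages(s_page, 1, 0);          /* and unmap it again */
            }
    }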
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/error-injection.h> #include <linux/error-injection.h>
#include <asm/tlb.h>
#include "trace_probe.h" #include "trace_probe.h"
#include "trace.h" #include "trace.h"
...@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, ...@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
* access_ok() should prevent writing to non-user memory, but in * access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does * some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS. * not provide enough validation, hence the check on KERNEL_DS.
*
* nmi_uaccess_okay() ensures the probe is not run in an interim
* state, when the task or mm is switched. This is specifically
* required to prevent the use of a temporary mm.
*/ */
if (unlikely(in_interrupt() || if (unlikely(in_interrupt() ||
...@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, ...@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM; return -EPERM;
if (unlikely(uaccess_kernel())) if (unlikely(uaccess_kernel()))
return -EPERM; return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
if (!access_ok(unsafe_ptr, size)) if (!access_ok(unsafe_ptr, size))
return -EPERM; return -EPERM;
......
...@@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page, ...@@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
} }
arch_free_page(page, order); arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0); kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0); if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
kasan_free_nondeferred_pages(page, order); kasan_free_nondeferred_pages(page, order);
return true; return true;
...@@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, ...@@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_refcounted(page); set_page_refcounted(page);
arch_alloc_page(page, order); arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1); if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order); kasan_alloc_pages(page, order);
kernel_poison_pages(page, 1 << order, 1); kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags); set_page_owner(page, order, gfp_flags);
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/proc_fs.h> #include <linux/proc_fs.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h> #include <linux/debugobjects.h>
#include <linux/kallsyms.h> #include <linux/kallsyms.h>
#include <linux/list.h> #include <linux/list.h>
...@@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size) ...@@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size)
spin_unlock(&vb->lock); spin_unlock(&vb->lock);
} }
/** static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
*
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
* to amortize TLB flushing overheads. What this means is that any page you
* have now, may, in a former life, have been mapped into kernel virtual
* address by the vmap layer and so there might be some CPUs with TLB entries
* still referencing that page (additional to the regular 1:1 kernel mapping).
*
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
* be sure that none of the pages we have control over will have any aliases
* from the vmap layer.
*/
void vm_unmap_aliases(void)
{ {
unsigned long start = ULONG_MAX, end = 0;
int cpu; int cpu;
int flush = 0;
if (unlikely(!vmap_initialized)) if (unlikely(!vmap_initialized))
return; return;
...@@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void) ...@@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void)
flush_tlb_kernel_range(start, end); flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock); mutex_unlock(&vmap_purge_lock);
} }
/**
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
*
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
* to amortize TLB flushing overheads. What this means is that any page you
* have now, may, in a former life, have been mapped into kernel virtual
* address by the vmap layer and so there might be some CPUs with TLB entries
* still referencing that page (additional to the regular 1:1 kernel mapping).
*
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
* be sure that none of the pages we have control over will have any aliases
* from the vmap layer.
*/
void vm_unmap_aliases(void)
{
unsigned long start = ULONG_MAX, end = 0;
int flush = 0;
_vm_unmap_aliases(start, end, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases); EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/** /**
...@@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr) ...@@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr)
return NULL; return NULL;
} }
static inline void set_area_direct_map(const struct vm_struct *area,
int (*set_direct_map)(struct page *page))
{
int i;
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
}
/* Handle removing and resetting vm mappings related to the vm_struct. */
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long addr = (unsigned long)area->addr;
unsigned long start = ULONG_MAX, end = 0;
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int i;
/*
* The below block can be removed when all architectures that have
* direct map permissions also have set_direct_map_() implementations.
* This is concerned with resetting the direct map for any vm alias with
* execute permissions, without leaving a RW+X window.
*/
if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
set_memory_nx(addr, area->nr_pages);
set_memory_rw(addr, area->nr_pages);
}
remove_vm_area(area->addr);
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
if (!flush_reset)
return;
/*
* If not deallocating pages, just do the flush of the VM area and
* return.
*/
if (!deallocate_pages) {
vm_unmap_aliases();
return;
}
/*
* If execution gets here, flush the vm mapping and reset the direct
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
for (i = 0; i < area->nr_pages; i++) {
if (page_address(area->pages[i])) {
start = min(addr, start);
end = max(addr, end);
}
}
/*
* Set direct map to something invalid so that it won't be cached if
* there are any accesses after the TLB flush, then flush the TLB and
* reset the direct map permissions to the default.
*/
set_area_direct_map(area, set_direct_map_invalid_noflush);
_vm_unmap_aliases(start, end, 1);
set_area_direct_map(area, set_direct_map_default_noflush);
}
static void __vunmap(const void *addr, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages)
{ {
struct vm_struct *area; struct vm_struct *area;
...@@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages) ...@@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
remove_vm_area(addr); vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) { if (deallocate_pages) {
int i; int i;
...@@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node); ...@@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node);
*/ */
void *vmalloc_exec(unsigned long size) void *vmalloc_exec(unsigned long size)
{ {
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, __builtin_return_address(0)); GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
NUMA_NO_NODE, __builtin_return_address(0));
} }
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
......
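Since vmalloc_exec() now tags its area with VM_FLUSH_RESET_PERMS, a caller can simply vfree() the buffer when done: the direct map is reset and TLBs are flushed before the pages are reused, and per the flag's comment this must not happen from interrupt context. Minimal sketch:

    static void exec_buffer_demo(void)
    {
            void *buf = vmalloc_exec(PAGE_SIZE);

            if (!buf)
                    return;
            /* copy in and run generated code here (hypothetical) */
            vfree(buf);      /* __vunmap() -> vm_remove_mappings() handles the reset + flush */
    }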