Commit 99792e0c authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
parents 382d72a9 977e4be5
This diff is collapsed.
......@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing
that we have them enabled.
config X86_CPA_STATISTICS
bool "Enable statistic for Change Page Attribute"
depends on DEBUG_FS
---help---
Expose statistics about the Change Page Attribute mechanims, which
helps to determine the effectivness of preserving large and huge
page mappings when mapping protections are changed.
config ARCH_HAS_MEM_ENCRYPT
def_bool y
......
......@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
/**
* ioremap - map bus memory into CPU space
......
......@@ -67,7 +67,7 @@ struct kimage;
/* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL)
#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/*
* CPU does not save ss and sp on stack if execution is already
......
......@@ -59,13 +59,16 @@
#endif
/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
* next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
* 512MiB by default, leaving 1.5GiB for modules once the page tables
* are fully set up. If kernel ASLR is configured, it can extend the
* kernel page table mapping, reducing the size of the modules area.
* Maximum kernel image size is limited to 1 GiB, due to the fixmap living
* in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
*
* On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
* page tables are fully set up.
*
* If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
* of the modules area to 1.5 GiB.
*/
#if defined(CONFIG_RANDOMIZE_BASE)
#ifdef CONFIG_RANDOMIZE_BASE
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
......
......@@ -6,16 +6,23 @@
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush(tlb) \
{ \
if (!tlb->fullmm && !tlb->need_flush_all) \
flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
else \
flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
}
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
unsigned long start = 0UL, end = TLB_FLUSH_ALL;
unsigned int stride_shift = tlb_get_unmap_shift(tlb);
if (!tlb->fullmm && !tlb->need_flush_all) {
start = tlb->start;
end = tlb->end;
}
flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}
/*
* While x86 architecture in general requires an IPI to perform TLB
* shootdown, enablement code for several hypervisors overrides
......
......@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
static inline bool tlb_defer_switch_to_init_mm(void)
{
/*
* If we have PCID, then switching to init_mm is reasonably
* fast. If we don't have PCID, then switching to init_mm is
* quite slow, so we try to defer it in the hopes that we can
* avoid it entirely. The latter approach runs the risk of
* receiving otherwise unnecessary IPIs.
*
* This choice is just a heuristic. The tlb code can handle this
* function returning true or false regardless of whether we have
* PCID.
*/
return !static_cpu_has(X86_FEATURE_PCID);
}
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
......@@ -547,23 +531,30 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
unsigned int stride_shift;
bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
#define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
: PAGE_SHIFT, false)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
unsigned long end, unsigned int stride_shift,
bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
......
......@@ -11,40 +11,62 @@
#include <linux/uaccess.h>
#include <linux/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf,
bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (encrypted)
vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
else
vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
if (userbuf) {
if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy();
iounmap(vaddr);
iounmap((void __iomem *)vaddr);
return csize;
}
/**
* copy_oldmem_page - copy one page of memory
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from the old kernel's memory. For this page, there is no pte
* mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
}
/**
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
* memory with the encryption mask set to accomodate kdump on SME-enabled
* machines.
*/
ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
}
......@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
ldt->slot = slot;
return 0;
......
......@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
......
......@@ -19,7 +19,9 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/pci.h>
#include <asm/e820/types.h>
#include <asm/pgtable.h>
/*
......@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift;
}
static void note_wx(struct pg_state *st)
{
unsigned long npages;
npages = (st->current_address - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
* If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
st->current_address <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
#endif
/* Account the WX pages */
st->wx_pages += npages;
WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
}
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
......@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
unsigned long delta;
int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
WARN_ONCE(1,
"x86/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address,
(void *)st->start_address);
st->wx_pages += (st->current_address -
st->start_address) / PAGE_SIZE;
}
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
note_wx(st);
/*
* Now print the actual finished series
......
This diff is collapsed.
......@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
size >> 10);
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
......
......@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller)
unsigned long size, enum page_cache_mode pcm,
void *caller, bool encrypted)
{
unsigned long offset, vaddr;
resource_size_t last_addr;
......@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping.
*/
prot = PAGE_KERNEL_IO;
if (sev_active() && mem_flags.desc_other)
if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
switch (pcm) {
......@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_nocache);
......@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL_GPL(ioremap_uc);
......@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wc);
......@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wt);
void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0), true);
}
EXPORT_SYMBOL(ioremap_encrypted);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_cache);
......@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);
......
This diff is collapsed.
......@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
......@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
return;
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
*/
if (!was_lazy)
return;
/*
* Read the tlb_gen to check whether a flush is needed.
* If the TLB is up to date, just use it.
* The barrier synchronizes with the tlb_gen increment in
* the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen)
return;
/*
* TLB contents went out of date while we were in lazy
* mode. Fall through to the TLB switching code below.
*/
new_asid = prev_asid;
need_flush = true;
} else {
u16 new_asid;
bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
......@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
}
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
load_mm_cr4(next);
switch_ldt(real_prev, next);
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
}
/*
......@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
if (tlb_defer_switch_to_init_mm()) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
......@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
......@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += PAGE_SIZE;
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
......@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
static bool tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
......@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
smp_call_function_many(cpumask, flush_tlb_func_remote,
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
/*
......@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
......@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
!(vmflag & VM_HUGETLB) &&
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
......
......@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>
#include <linux/stackprotector.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
......@@ -88,6 +89,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
......
......@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
old_devtb_phys = entry & PAGE_MASK;
/*
* When SME is enabled in the first kernel, the entry includes the
* memory encryption mask(sme_me_mask), we must remove the memory
* encryption mask to obtain the true physical address in kdump kernel.
*/
old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
if (old_devtb_phys >= 0x100000000ULL) {
pr_err("The address of old device table is above 4G, not trustworthy!\n");
return false;
}
old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
old_devtb = (sme_active() && is_kdump_kernel())
? (__force void *)ioremap_encrypted(old_devtb_phys,
dev_table_size)
: memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
if (!old_devtb)
return false;
......
......@@ -24,6 +24,8 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include "internal.h"
......@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
/* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf)
u64 *ppos, int userbuf,
bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
......@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
offset,
userbuf);
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (tmp < 0)
return tmp;
}
......@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
return read_from_oldmem(buf, count, ppos, 0);
return read_from_oldmem(buf, count, ppos, 0, false);
}
/*
......@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
return read_from_oldmem(buf, count, ppos, 0);
return read_from_oldmem(buf, count, ppos, 0, sme_active());
}
/*
......@@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
prot = pgprot_encrypted(prot);
return remap_pfn_range(vma, from, pfn, size, prot);
}
/*
* Architectures which support memory encryption override this.
*/
ssize_t __weak
copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
}
/*
* Copy to either kernel or user space
*/
......@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
m->offset + m->size - *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
tmp = read_from_oldmem(buffer, tsz, &start,
userbuf, sme_active());
if (tmp < 0)
return tmp;
buflen -= tsz;
......
......@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
unsigned long, int);
extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
size_t csize, unsigned long offset,
int userbuf);
void vmcore_cleanup(void);
/* Architecture code defines this if there are other possible ELF
......
......@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags);
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask);
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
#ifdef CONFIG_SMP
......
......@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
}
}
/* Ensure that these pages are decrypted if SME is enabled. */
if (pages)
arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
return pages;
}
......@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = -ENOMEM;
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
......@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
......
......@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/*
* Finds the lowest iomem resource existing within [res->start.res->end).
* The caller must specify res->start, res->end, res->flags, and optionally
* desc. If found, returns 0, res is overwritten, if not found, returns -1.
* This function walks the whole tree and not just first level children until
* and unless first_level_children_only is true.
/**
* Finds the lowest iomem resource that covers part of [start..end]. The
* caller must specify start, end, flags, and desc (which may be
* IORES_DESC_NONE).
*
* If a resource is found, returns 0 and *res is overwritten with the part
* of the resource that's within [start..end]; if none is found, returns
* -1.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
*/
static int find_next_iomem_res(struct resource *res, unsigned long desc,
bool first_level_children_only)
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, struct resource *res)
{
resource_size_t start, end;
struct resource *p;
bool sibling_only = false;
BUG_ON(!res);
start = res->start;
end = res->end;
BUG_ON(start >= end);
if (!res)
return -EINVAL;
if (first_level_children_only)
sibling_only = true;
if (start >= end)
return -EINVAL;
read_lock(&resource_lock);
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if ((p->flags & res->flags) != res->flags)
for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
if ((p->flags & flags) != flags)
continue;
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
continue;
......@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
p = NULL;
break;
}
if ((p->end >= start) && (p->start < end))
if ((p->end >= start) && (p->start <= end))
break;
}
read_unlock(&resource_lock);
if (!p)
return -1;
/* copy data */
if (res->start < p->start)
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
res->start = max(start, p->start);
res->end = min(end, p->end);
res->flags = p->flags;
res->desc = p->desc;
return 0;
}
static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
bool first_level_children_only,
void *arg,
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, void *arg,
int (*func)(struct resource *, void *))
{
u64 orig_end = res->end;
struct resource res;
int ret = -1;
while ((res->start < res->end) &&
!find_next_iomem_res(res, desc, first_level_children_only)) {
ret = (*func)(res, arg);
while (start < end &&
!find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
res->start = res->end + 1;
res->end = orig_end;
start = res.end + 1;
}
return ret;
}
/*
/**
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and
......@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
res.start = start;
res.end = end;
res.flags = flags;
return __walk_iomem_res_desc(&res, desc, false, arg, func);
return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
......@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *))
int (*func)(struct resource *, void *))
{
struct resource res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func);
}
......@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
int walk_mem_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *))
{
struct resource res;
res.start = start;
res.end = end;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func);
}
......@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
* It is to be used only for System RAM.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
void *arg, int (*func)(unsigned long, unsigned long, void *))
{
resource_size_t start, end;
unsigned long flags;
struct resource res;
unsigned long pfn, end_pfn;
u64 orig_end;
int ret = -1;
res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
start = (u64) start_pfn << PAGE_SHIFT;
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
true, &res)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
break;
res.start = res.end + 1;
res.end = orig_end;
start = res.end + 1;
}
return ret;
}
......@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
* @constraint: the size and alignment constraints to be met.
*/
static int reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint *constraint)
resource_size_t newsize,
struct resource_constraint *constraint)
{
int err=0;
struct resource new = *old;
......@@ -972,7 +957,7 @@ static int __adjust_resource(struct resource *res, resource_size_t start,
* Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
resource_size_t size)
{
int result;
......@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
}
EXPORT_SYMBOL(adjust_resource);
static void __init __reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
static void __init
__reserve_region_with_split(struct resource *root, resource_size_t start,
resource_size_t end, const char *name)
{
struct resource *parent = root;
struct resource *conflict;
......@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
}
void __init reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
void __init
reserve_region_with_split(struct resource *root, resource_size_t start,
resource_size_t end, const char *name)
{
int abort = 0;
......@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
* The described resource region must match a currently busy region.
*/
void __release_region(struct resource *parent, resource_size_t start,
resource_size_t n)
resource_size_t n)
{
struct resource **p;
resource_size_t end;
......@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
* simplicity. Enhance this logic when necessary.
*/
int release_mem_region_adjustable(struct resource *parent,
resource_size_t start, resource_size_t size)
resource_size_t start, resource_size_t size)
{
struct resource **p;
struct resource *res;
......@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
this->start == match->start && this->n == match->n;
}
struct resource * __devm_request_region(struct device *dev,
struct resource *parent, resource_size_t start,
resource_size_t n, const char *name)
struct resource *
__devm_request_region(struct device *dev, struct resource *parent,
resource_size_t start, resource_size_t n, const char *name)
{
struct region_devres *dr = NULL;
struct resource *res;
......
......@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (ARM and SH have never invoked the canary
* init for the non boot CPUs!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. The boot CPU already has it initialized but no harm
* in doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
......
......@@ -56,7 +56,6 @@
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
#include <linux/suspend.h>
#include <linux/swait.h>
......
......@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
gfp_t gfp_flags, const struct cpumask *mask)
{
cpumask_var_t cpus;
int cpu, ret;
......@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable();
for_each_online_cpu(cpu)
for_each_cpu(cpu, mask)
if (cond_func(cpu, info))
cpumask_set_cpu(cpu, cpus);
__cpumask_set_cpu(cpu, cpus);
on_each_cpu_mask(cpus, func, info, wait);
preempt_enable();
free_cpumask_var(cpus);
......@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
* just have to IPI them one by one.
*/
preempt_disable();
for_each_online_cpu(cpu)
for_each_cpu(cpu, mask)
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
......@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
preempt_enable();
}
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
static void do_nothing(void *unused)
......
......@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same condtions in UP and SMP.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask)
{
unsigned long flags;
......@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
}
preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
......
......@@ -8,6 +8,7 @@
*/
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment