Commit e37e43a4 authored by Andy Lutomirski's avatar Andy Lutomirski Committed by Ingo Molnar

x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)

This allows x86_64 kernels to enable vmapped stacks by setting
HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y
high level Kconfig option.

There are a couple of interesting bits:

First, x86 lazily faults in top-level paging entries for the vmalloc
area.  This won't work if we get a page fault while trying to access
the stack: the CPU will promote it to a double-fault and we'll die.
To avoid this problem, probe the new stack when switching stacks and
forcibly populate the pgd entry for the stack when switching mms.

Second, once we have guard pages around the stack, we'll want to
detect and handle stack overflow.

I didn't enable it on x86_32.  We'd need to rework the double-fault
code a bit and I'm concerned about running out of vmalloc virtual
addresses under some workloads.

This patch, by itself, will behave somewhat erratically when the
stack overflows while RSP is still more than a few tens of bytes
above the bottom of the stack.  Specifically, we'll get #PF and make
it to no_context and them oops without reliably triggering a
double-fault, and no_context doesn't know about stack overflows.
The next patch will improve that case.

Thank you to Nadav and Brian for helping me pay enough attention to
the SDM to hopefully get this right.
Signed-off-by: default avatarAndy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c88f3e2920b18e6cc621d772a04a62c06869037e.1470907718.git.luto@kernel.org
[ Minor edits. ]
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent b4a0f533
...@@ -94,6 +94,7 @@ config X86 ...@@ -94,6 +94,7 @@ config X86
select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_EBPF_JIT if X86_64 select HAVE_EBPF_JIT if X86_64
select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_CC_STACKPROTECTOR select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL select HAVE_CMPXCHG_LOCAL
......
...@@ -8,6 +8,28 @@ struct tss_struct; ...@@ -8,6 +8,28 @@ struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss); struct tss_struct *tss);
/* This runs runs on the previous thread's stack. */
static inline void prepare_switch_to(struct task_struct *prev,
struct task_struct *next)
{
#ifdef CONFIG_VMAP_STACK
/*
* If we switch to a stack that has a top-level paging entry
* that is not present in the current mm, the resulting #PF will
* will be promoted to a double-fault and we'll panic. Probe
* the new stack now so that vmalloc_fault can fix up the page
* tables if needed. This can only happen if we use a stack
* in vmap space.
*
* We assume that the stack is aligned so that it never spans
* more than one top-level paging entry.
*
* To minimize cache pollution, just follow the stack pointer.
*/
READ_ONCE(*(unsigned char *)next->thread.sp);
#endif
}
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
#ifdef CONFIG_CC_STACKPROTECTOR #ifdef CONFIG_CC_STACKPROTECTOR
...@@ -39,6 +61,8 @@ do { \ ...@@ -39,6 +61,8 @@ do { \
*/ \ */ \
unsigned long ebx, ecx, edx, esi, edi; \ unsigned long ebx, ecx, edx, esi, edi; \
\ \
prepare_switch_to(prev, next); \
\
asm volatile("pushl %%ebp\n\t" /* save EBP */ \ asm volatile("pushl %%ebp\n\t" /* save EBP */ \
"movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
...@@ -103,7 +127,9 @@ do { \ ...@@ -103,7 +127,9 @@ do { \
* clean in kernel mode, with the possible exception of IOPL. Kernel IOPL * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
* has no effect. * has no effect.
*/ */
#define switch_to(prev, next, last) \ #define switch_to(prev, next, last) \
prepare_switch_to(prev, next); \
\
asm volatile(SAVE_CONTEXT \ asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
......
...@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) ...@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_VMAP_STACK
static void __noreturn handle_stack_overflow(const char *message,
struct pt_regs *regs,
unsigned long fault_address)
{
printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
(void *)fault_address, current->stack,
(char *)current->stack + THREAD_SIZE - 1);
die(message, regs, 0);
/* Be absolutely certain we don't return. */
panic(message);
}
#endif
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* Runs on IST stack */ /* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{ {
static const char str[] = "double fault"; static const char str[] = "double fault";
struct task_struct *tsk = current; struct task_struct *tsk = current;
#ifdef CONFIG_VMAP_STACK
unsigned long cr2;
#endif
#ifdef CONFIG_X86_ESPFIX64 #ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[]; extern unsigned char native_irq_return_iret[];
...@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) ...@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code; tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_DF; tsk->thread.trap_nr = X86_TRAP_DF;
#ifdef CONFIG_VMAP_STACK
/*
* If we overflow the stack into a guard page, the CPU will fail
* to deliver #PF and will send #DF instead. Similarly, if we
* take any non-IST exception while too close to the bottom of
* the stack, the processor will get a page fault while
* delivering the exception and will generate a double fault.
*
* According to the SDM (footnote in 6.15 under "Interrupt 14 -
* Page-Fault Exception (#PF):
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
* deliv- ered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
* double fault.
*
* The logic below has a small possibility of incorrectly diagnosing
* some errors as stack overflows. For example, if the IDT or GDT
* gets corrupted such that #GP delivery fails due to a bad descriptor
* causing #GP and we hit this condition while CR2 coincidentally
* points to the stack guard page, we'll think we overflowed the
* stack. Given that we're going to panic one way or another
* if this happens, this isn't necessarily worth fixing.
*
* If necessary, we could improve the test by only diagnosing
* a stack overflow if the saved RSP points within 47 bytes of
* the bottom of the stack: if RSP == tsk_stack + 48 and we
* take an exception, the stack is already aligned and there
* will be enough room SS, RSP, RFLAGS, CS, RIP, and a
* possible error code, so a stack overflow would *not* double
* fault. With any less space left, exception delivery could
* fail, and, as a practical matter, we've overflowed the
* stack even if the actual trigger for the double fault was
* something else.
*/
cr2 = read_cr2();
if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif
#ifdef CONFIG_DOUBLEFAULT #ifdef CONFIG_DOUBLEFAULT
df_debug(regs, error_code); df_debug(regs, error_code);
#endif #endif
......
...@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
unsigned cpu = smp_processor_id(); unsigned cpu = smp_processor_id();
if (likely(prev != next)) { if (likely(prev != next)) {
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
* If our current stack is in vmalloc space and isn't
* mapped in the new pgd, we'll double-fault. Forcibly
* map it.
*/
unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
pgd_t *pgd = next->pgd + stack_pgd_index;
if (unlikely(pgd_none(*pgd)))
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
}
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next); this_cpu_write(cpu_tlbstate.active_mm, next);
#endif #endif
cpumask_set_cpu(cpu, mm_cpumask(next)); cpumask_set_cpu(cpu, mm_cpumask(next));
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment