Commit 03f70388 authored by Ingo Molnar

Merge branch 'tip/x86/core-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace into perf/core
parents 9e183426 42181186
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
extern void hw_breakpoint_restore(void);
#ifdef CONFIG_X86_64
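/*
 * Non-zero while do_debug()/do_int3() run, since they may move off the
 * fixed debug stack; is_debug_stack() below checks this counter as well
 * as the address range, so the NMI path knows a debug stack may be live.
 */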
DECLARE_PER_CPU(int, debug_stack_usage);
static inline void debug_stack_usage_inc(void)
{
__get_cpu_var(debug_stack_usage)++;
}
static inline void debug_stack_usage_dec(void)
{
__get_cpu_var(debug_stack_usage)--;
}
int is_debug_stack(unsigned long addr);
void debug_stack_set_zero(void);
void debug_stack_reset(void);
#else /* !X86_64 */
static inline int is_debug_stack(unsigned long addr) { return 0; }
static inline void debug_stack_set_zero(void) { }
static inline void debug_stack_reset(void) { }
static inline void debug_stack_usage_inc(void) { }
static inline void debug_stack_usage_dec(void) { }
#endif /* X86_64 */
#endif /* __KERNEL__ */
#endif /* _ASM_X86_DEBUGREG_H */
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
extern struct desc_ptr nmi_idt_descr;
extern gate_desc nmi_idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
desc->limit = (limit >> 16) & 0xf;
}
#ifdef CONFIG_X86_64
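/*
 * Like _set_gate() below, but writes into the spare nmi_idt_table and always
 * uses ist == 0, so a gate installed here does not switch to a fixed IST
 * stack. trap_init() copies idt_table into nmi_idt_table and then overrides
 * the debug and int3 vectors with this helper; the NMI code loads that IDT
 * while a debug stack may be in use.
 */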
static inline void set_nmi_gate(int gate, void *addr)
{
gate_desc s;
pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
write_idt_entry(nmi_idt_table, gate, &s);
}
#endif
static inline void _set_gate(int gate, unsigned type, void *addr,
unsigned dpl, unsigned ist, unsigned seg)
{
...
@@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) nmi_idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1092,26 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
DEFINE_PER_CPU(int, debug_stack_usage);
int is_debug_stack(unsigned long addr)
{
return __get_cpu_var(debug_stack_usage) ||
(addr <= __get_cpu_var(debug_stack_addr) &&
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
void debug_stack_set_zero(void)
{
load_idt((const struct desc_ptr *)&nmi_idt_descr);
}
void debug_stack_reset(void)
{
load_idt((const struct desc_ptr *)&idt_descr);
}
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1208,6 +1230,8 @@ void __cpuinit cpu_init(void)
estacks += exception_stack_sizes[v];
oist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
if (v == DEBUG_STACK-1)
per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
}
}
...
@@ -1475,62 +1475,214 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
/*
* Test if a given stack is an NMI stack or not.
*/
.macro test_in_nmi reg stack nmi_ret normal_ret
cmpq %\reg, \stack
ja \normal_ret
subq $EXCEPTION_STKSZ, %\reg
cmpq %\reg, \stack
jb \normal_ret
jmp \nmi_ret
.endm
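/*
 * In C terms, the macro asks whether \stack lies within the
 * EXCEPTION_STKSZ-sized window ending at \reg. A rough sketch for
 * reference (illustrative only, not part of the patch):
 *
 * in_nmi_stack = stack <= reg && stack >= reg - EXCEPTION_STKSZ;
 *
 * Note that \reg is clobbered by the subq.
 */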
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
* This means that we can have nested NMIs where the next
* NMI is using the top of the stack of the previous NMI. We
* can't let it execute because the nested NMI will corrupt the
* stack of the previous NMI. NMI handlers are not re-entrant
* anyway.
*
* To handle this case we do the following:
* Check a special location on the stack that contains
* a variable that is set when NMIs are executing.
* The interrupted task's stack is also checked to see if it
* is an NMI stack.
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
* o Copy the interrupt frame into a "saved" location on the stack
* o Copy the interrupt frame into a "copy" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
* o Modify the "copy" location to jump to repeat_nmi
* o Return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable;
* the NMI stack itself will still tell any nested NMI at that point
* that it is nested. Then we pop the stack normally with iret, and if there was
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
*/
/* Use %rdx as our temp variable throughout */
pushq_cfi %rdx
/*
* Check the special variable on the stack to see if NMIs are
* executing.
*/
cmp $1, -8(%rsp)
je nested_nmi
/*
* Now test if the previous stack was an NMI stack.
* We need the double check: we check the NMI stack to cover the
* race where the first NMI clears the variable just before returning,
* and we check the variable because the first NMI could be running
* a breakpoint routine on a breakpoint stack.
*/
lea 6*8(%rsp), %rdx
test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
nested_nmi:
/*
* Do nothing if we interrupted the fixup in repeat_nmi.
* It's about to repeat the NMI handler, so we are fine
* with ignoring this one.
*/
movq $repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
ja 1f
movq $end_repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
ja nested_nmi_out
1:
/* Set up the interrupted NMI's stack to jump to repeat_nmi */
leaq -6*8(%rsp), %rdx
movq %rdx, %rsp
CFI_ADJUST_CFA_OFFSET 6*8
pushq_cfi $__KERNEL_DS
pushq_cfi %rdx
pushfq_cfi
pushq_cfi $__KERNEL_CS
pushq_cfi $repeat_nmi
/* Put stack back */
addq $(11*8), %rsp
CFI_ADJUST_CFA_OFFSET -11*8
nested_nmi_out:
popq_cfi %rdx
/* No need to check faults here */
INTERRUPT_RETURN
first_nmi:
/*
* Because nested NMIs will use the pushed location that we
* stored in rdx, we must keep that space available.
* Here's what our stack frame will look like:
* +-------------------------+
* | original SS |
* | original Return RSP |
* | original RFLAGS |
* | original CS |
* | original RIP |
* +-------------------------+
* | temp storage for rdx |
* +-------------------------+
* | NMI executing variable |
* +-------------------------+
* | Saved SS |
* | Saved Return RSP |
* | Saved RFLAGS |
* | Saved CS |
* | Saved RIP |
* +-------------------------+
* | copied SS |
* | copied Return RSP |
* | copied RFLAGS |
* | copied CS |
* | copied RIP |
* +-------------------------+
* | pt_regs |
* +-------------------------+
*
* The saved RIP is used to fix up the copied RIP that a nested
* NMI may zero out. The original stack frame and the temp storage
* are also used by nested NMIs and cannot be trusted on exit.
*/
/* Set the NMI executing variable on the stack. */
pushq_cfi $1
/* Copy the stack frame to the Saved frame */
.rept 5
pushq_cfi 6*8(%rsp)
.endr
/* Make another copy, this one may be modified by nested NMIs */
.rept 5
pushq_cfi 4*8(%rsp)
.endr
/* Do not pop rdx, nested NMIs will corrupt it */
movq 11*8(%rsp), %rdx
/*
* Everything below this point can be preempted by a nested
* NMI if the first NMI took an exception. An NMI repeated because
* an exception let a nested NMI in will restart here, and it
* can still be preempted by another NMI.
*/
restart_nmi:
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
/*
* Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context, even with
* normal interrupts enabled. An NMI should not be setting
* NEED_RESCHED or anything else that normal interrupts and
* exceptions might do.
*/
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
#ifdef CONFIG_TRACE_IRQFLAGS
/* paranoidexit; without TRACE_IRQS_OFF */
/* ebx: no swapgs flag */
DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
testl $3,CS(%rsp)
jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
/* Clear the NMI executing stack variable */
movq $0, 10*8(%rsp)
jmp irq_return
nmi_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz nmi_swapgs
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
testl $_TIF_NEED_RESCHED,%ebx
jnz nmi_schedule
movl %ebx,%edx /* arg3: thread flags */
ENABLE_INTERRUPTS(CLBR_NONE)
xorl %esi,%esi /* arg2: oldset */
movq %rsp,%rdi /* arg1: &pt_regs */
call do_notify_resume
DISABLE_INTERRUPTS(CLBR_NONE)
jmp nmi_userspace
nmi_schedule:
ENABLE_INTERRUPTS(CLBR_ANY)
call schedule
DISABLE_INTERRUPTS(CLBR_ANY)
jmp nmi_userspace
CFI_ENDPROC
#else
jmp paranoid_exit
CFI_ENDPROC
#endif
END(nmi)
/*
* If an NMI hits a breakpoint or other exception whose iret takes
* it out of NMI context, a nested NMI may come in. In that case,
* the nested NMI changes the preempted NMI's stack so that it
* jumps here when it does its final iret.
*/
repeat_nmi:
INTR_FRAME
/* Update the stack variable to say we are still in NMI */
movq $1, 5*8(%rsp)
/* Copy the saved stack frame back to the copy frame */
.rept 5
pushq_cfi 4*8(%rsp)
.endr
jmp restart_nmi
CFI_ENDPROC
end_repeat_nmi:
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
...
@@ -417,6 +417,10 @@ ENTRY(phys_base)
ENTRY(idt_table)
.skip IDT_ENTRIES * 16
.align L1_CACHE_BYTES
ENTRY(nmi_idt_table)
.skip IDT_ENTRIES * 16
__PAGE_ALIGNED_BSS
.align PAGE_SIZE
ENTRY(empty_zero_page)
...
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
unknown_nmi_error(reason, regs);
}
/*
* NMIs can hit breakpoints, which causes the CPU to lose its
* NMI context when the breakpoint handler does an iret.
*/
#ifdef CONFIG_X86_32
/*
* For i386, NMIs use the same stack as the kernel, and we can
* add a workaround to the iret problem in C. Simply have 3 states
* the NMI can be in.
*
* 1) not running
* 2) executing
* 3) latched
*
* When no NMI is in progress, it is in the "not running" state.
* When an NMI comes in, it goes into the "executing" state.
* Normally, if another NMI is triggered, it does not interrupt
* the running NMI and the HW will simply latch it so that when
* the first NMI finishes, it will restart the second NMI.
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
* If an NMI hits a breakpoint that executes an iret, another
* NMI can preempt it. We do not want to allow this new NMI
* to run, but we want to execute it when the first one finishes.
* We set the state to "latched", and the first NMI will perform
* a cmpxchg on the state, and if it doesn't successfully
* reset the state to "not running" it will restart the next
* NMI.
*/
enum nmi_states {
NMI_NOT_RUNNING,
NMI_EXECUTING,
NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
#define nmi_nesting_preprocess(regs) \
do { \
if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
__get_cpu_var(nmi_state) = NMI_LATCHED; \
return; \
} \
nmi_restart: \
__get_cpu_var(nmi_state) = NMI_EXECUTING; \
} while (0)
#define nmi_nesting_postprocess() \
do { \
if (cmpxchg(&__get_cpu_var(nmi_state), \
NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
goto nmi_restart; \
} while (0)
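/*
 * Illustrative only (not part of the patch): with the two macros expanded,
 * the do_nmi() flow below is equivalent to this single loop:
 *
 * if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {
 * __get_cpu_var(nmi_state) = NMI_LATCHED;
 * return;
 * }
 * do {
 * __get_cpu_var(nmi_state) = NMI_EXECUTING;
 * ... handle the NMI ...
 * } while (cmpxchg(&__get_cpu_var(nmi_state),
 * NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING);
 *
 * The cmpxchg is what keeps a latched NMI from being dropped: if a nested
 * NMI set NMI_LATCHED while the handler ran, the exchange fails and the
 * handler loops to service it.
 */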
#else /* x86_64 */
/*
* On x86_64 things are a bit more difficult. It has the same problem:
* an NMI hitting a breakpoint whose handler does an iret will remove the
* NMI context, allowing a nested NMI to enter. What makes this more
* difficult is that both NMIs and breakpoints have their own stack.
* When a new NMI or breakpoint is executed, the stack is set to a fixed
* address. If an NMI is nested, it will have its stack set at that same
* fixed address that the first NMI had, and will start corrupting the
* stack. This is handled in entry_64.S, but the same problem exists with
* the breakpoint stack.
*
* If a breakpoint is being processed on the debug stack and an NMI
* comes in and also hits a breakpoint, the stack pointer will be set
* to the same fixed address as that of the interrupted breakpoint,
* corrupting that stack. To handle this case,
* check if the stack that was interrupted is the debug stack, and if
* so, change the IDT so that new breakpoints will use the current stack
* and not switch to the fixed address. On return of the NMI, switch back
* to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
/*
* If we interrupted a breakpoint, it is possible that
* the NMI handler will hit breakpoints too. We need to
* change the IDT such that breakpoints that happen here
* continue to use the NMI stack.
*/
if (unlikely(is_debug_stack(regs->sp))) {
debug_stack_set_zero();
__get_cpu_var(update_debug_stack) = 1;
}
}
static inline void nmi_nesting_postprocess(void)
{
if (unlikely(__get_cpu_var(update_debug_stack)))
debug_stack_reset();
}
#endif
dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
default_do_nmi(regs);
nmi_exit();
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
}
void stop_nmi(void)
...
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
return;
#endif
/*
* Let others (NMI) know that the debug stack is in use
* as we may switch to the interrupt stack.
*/
debug_stack_usage_inc();
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
}
#ifdef CONFIG_X86_64
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
SIGTRAP) == NOTIFY_STOP)
return;
/*
* Let others (NMI) know that the debug stack is in use
* as we may switch to the interrupt stack.
*/
debug_stack_usage_inc();
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
return;
}
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
return;
}
@@ -723,4 +737,10 @@ void __init trap_init(void)
cpu_init();
x86_init.irqs.trap_init();
#ifdef CONFIG_X86_64
memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
set_nmi_gate(1, &debug);
set_nmi_gate(3, &int3);
#endif
}