Commit f2b37575, authored by Andy Lutomirski, committed by Ingo Molnar

x86/entry: Vastly simplify SYSENTER TF (single-step) handling

Due to a blatant design error, SYSENTER doesn't clear TF (single-step).

As a result, if a user does SYSENTER with TF set, we will single-step
through the kernel until something clears TF.  There is absolutely
nothing we can do to prevent this short of turning off SYSENTER [1].

Simplify the handling considerably with two changes:

  1. We already sanitize EFLAGS in SYSENTER to clear NT and AC.  We can
     add TF to that list of flags to sanitize with no overhead whatsoever.

  2. Teach do_debug() to ignore single-step traps in the SYSENTER prologue.

That's all we need to do.

Don't get too excited -- our handling is still buggy on 32-bit
kernels.  There's nothing wrong with the SYSENTER code itself, but
the #DB prologue has a clever fixup for traps on the very first
instruction of entry_SYSENTER_32, and the fixup doesn't work quite
correctly.  The next two patches will fix that.

[1] We could probably prevent it by forcing BTF on at all times and
    making sure we clear TF before any branches in the SYSENTER
    code.  Needless to say, this is a bad idea.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a30d2ea06fe4b621fe6a9ef911b02c0f38feb6f2.1457578375.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 8bb56436
...@@ -287,7 +287,26 @@ need_resched: ...@@ -287,7 +287,26 @@ need_resched:
END(resume_kernel) END(resume_kernel)
#endif #endif
# SYSENTER call handler stub GLOBAL(__begin_SYSENTER_singlestep_region)
/*
* All code from here through __end_SYSENTER_singlestep_region is subject
* to being single-stepped if a user program sets TF and executes SYSENTER.
* There is absolutely nothing that we can do to prevent this from happening
* (thanks Intel!). To keep our handling of this situation as simple as
* possible, we handle TF just like AC and NT, except that our #DB handler
* will ignore all of the single-step traps generated in this range.
*/
#ifdef CONFIG_XEN
/*
* Xen doesn't set %esp to be precisely what the normal SYSENTER
* entry point expects, so fix it up before using the normal path.
*/
ENTRY(xen_sysenter_target)
addl $5*4, %esp /* remove xen-provided frame */
jmp sysenter_past_esp
#endif
ENTRY(entry_SYSENTER_32) ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp: sysenter_past_esp:
...@@ -301,19 +320,25 @@ sysenter_past_esp: ...@@ -301,19 +320,25 @@ sysenter_past_esp:
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/* /*
* SYSENTER doesn't filter flags, so we need to clear NT and AC * SYSENTER doesn't filter flags, so we need to clear NT, AC
* ourselves. To save a few cycles, we can check whether * and TF ourselves. To save a few cycles, we can check whether
* either was set instead of doing an unconditional popfq. * either was set instead of doing an unconditional popfq.
* This needs to happen before enabling interrupts so that * This needs to happen before enabling interrupts so that
* we don't get preempted with NT set. * we don't get preempted with NT set.
* *
* If TF is set, we will single-step all the way to here -- do_debug
* will ignore all the traps. (Yes, this is slow, but so is
* single-stepping in general. This allows us to avoid having
* a more complicated code to handle the case where a user program
* forces us to single-step through the SYSENTER entry code.)
*
* NB.: .Lsysenter_fix_flags is a label with the code under it moved * NB.: .Lsysenter_fix_flags is a label with the code under it moved
* out-of-line as an optimization: NT is unlikely to be set in the * out-of-line as an optimization: NT is unlikely to be set in the
* majority of the cases and instead of polluting the I$ unnecessarily, * majority of the cases and instead of polluting the I$ unnecessarily,
* we're keeping that code behind a branch which will predict as * we're keeping that code behind a branch which will predict as
* not-taken and therefore its instructions won't be fetched. * not-taken and therefore its instructions won't be fetched.
*/ */
testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
jnz .Lsysenter_fix_flags jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed: .Lsysenter_flags_fixed:
...@@ -369,6 +394,7 @@ sysenter_past_esp: ...@@ -369,6 +394,7 @@ sysenter_past_esp:
pushl $X86_EFLAGS_FIXED pushl $X86_EFLAGS_FIXED
popfl popfl
jmp .Lsysenter_flags_fixed jmp .Lsysenter_flags_fixed
GLOBAL(__end_SYSENTER_singlestep_region)
ENDPROC(entry_SYSENTER_32) ENDPROC(entry_SYSENTER_32)
# system call handler stub # system call handler stub
...@@ -651,14 +677,6 @@ ENTRY(spurious_interrupt_bug) ...@@ -651,14 +677,6 @@ ENTRY(spurious_interrupt_bug)
END(spurious_interrupt_bug) END(spurious_interrupt_bug)
#ifdef CONFIG_XEN #ifdef CONFIG_XEN
/*
* Xen doesn't set %esp to be precisely what the normal SYSENTER
* entry point expects, so fix it up before using the normal path.
*/
ENTRY(xen_sysenter_target)
addl $5*4, %esp /* remove xen-provided frame */
jmp sysenter_past_esp
ENTRY(xen_hypervisor_callback) ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */ pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL SAVE_ALL
......
...@@ -94,13 +94,19 @@ ENTRY(entry_SYSENTER_compat) ...@@ -94,13 +94,19 @@ ENTRY(entry_SYSENTER_compat)
* This needs to happen before enabling interrupts so that * This needs to happen before enabling interrupts so that
* we don't get preempted with NT set. * we don't get preempted with NT set.
* *
* If TF is set, we will single-step all the way to here -- do_debug
* will ignore all the traps. (Yes, this is slow, but so is
* single-stepping in general. This allows us to avoid having
* a more complicated code to handle the case where a user program
* forces us to single-step through the SYSENTER entry code.)
*
* NB.: .Lsysenter_fix_flags is a label with the code under it moved * NB.: .Lsysenter_fix_flags is a label with the code under it moved
* out-of-line as an optimization: NT is unlikely to be set in the * out-of-line as an optimization: NT is unlikely to be set in the
* majority of the cases and instead of polluting the I$ unnecessarily, * majority of the cases and instead of polluting the I$ unnecessarily,
* we're keeping that code behind a branch which will predict as * we're keeping that code behind a branch which will predict as
* not-taken and therefore its instructions won't be fetched. * not-taken and therefore its instructions won't be fetched.
*/ */
testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
jnz .Lsysenter_fix_flags jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed: .Lsysenter_flags_fixed:
...@@ -121,6 +127,7 @@ ENTRY(entry_SYSENTER_compat) ...@@ -121,6 +127,7 @@ ENTRY(entry_SYSENTER_compat)
pushq $X86_EFLAGS_FIXED pushq $X86_EFLAGS_FIXED
popfq popfq
jmp .Lsysenter_flags_fixed jmp .Lsysenter_flags_fixed
GLOBAL(__end_entry_SYSENTER_compat)
ENDPROC(entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat)
/* /*
......
...@@ -7,12 +7,23 @@ ...@@ -7,12 +7,23 @@
void syscall_init(void); void syscall_init(void);
#ifdef CONFIG_X86_64
void entry_SYSCALL_64(void); void entry_SYSCALL_64(void);
void entry_SYSCALL_compat(void); #endif
#ifdef CONFIG_X86_32
void entry_INT80_32(void); void entry_INT80_32(void);
void entry_INT80_compat(void);
void entry_SYSENTER_32(void); void entry_SYSENTER_32(void);
void __begin_SYSENTER_singlestep_region(void);
void __end_SYSENTER_singlestep_region(void);
#endif
#ifdef CONFIG_IA32_EMULATION
void entry_SYSENTER_compat(void); void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_INT80_compat(void);
#endif
void x86_configure_nx(void); void x86_configure_nx(void);
void x86_report_nx(void); void x86_report_nx(void);
......
...@@ -559,6 +559,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) ...@@ -559,6 +559,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
NOKPROBE_SYMBOL(fixup_bad_iret); NOKPROBE_SYMBOL(fixup_bad_iret);
#endif #endif
/*
 * Report whether regs->ip lies inside the stretch of SYSENTER entry code
 * that can be single-stepped when SYSENTER is invoked with TF set.
 *
 * The check is deliberately coarse: any address inside the region counts,
 * because which instructions actually trap is not known in advance (BTF
 * could plausibly be set).
 *
 * Returns true when the address is at or after the region start and
 * strictly before the region end; always false when neither
 * CONFIG_X86_32 nor CONFIG_IA32_EMULATION is enabled.
 */
static bool is_sysenter_singlestep(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
	/* Native 32-bit kernel: the region is bracketed by explicit markers. */
	const unsigned long region_start =
		(unsigned long)__begin_SYSENTER_singlestep_region;
	const unsigned long region_end =
		(unsigned long)__end_SYSENTER_singlestep_region;
#elif defined(CONFIG_IA32_EMULATION)
	/* Compat entry: the region runs from the entry point to its end marker. */
	const unsigned long region_start =
		(unsigned long)entry_SYSENTER_compat;
	const unsigned long region_end =
		(unsigned long)__end_entry_SYSENTER_compat;
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
	/*
	 * A single comparison on the offset from region_start: an address
	 * below the start yields a wrapped, very large offset and therefore
	 * fails the test as well.
	 */
	return regs->ip - region_start < region_end - region_start;
#else
	return false;
#endif
}
/* /*
* Our handling of the processor debug registers is non-trivial. * Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore * We do not clear them on entry and exit from the kernel. Therefore
...@@ -616,6 +639,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ...@@ -616,6 +639,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
*/ */
clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
is_sysenter_singlestep(regs))) {
dr6 &= ~DR_STEP;
if (!dr6)
goto exit;
/*
* else we might have gotten a single-step trap and hit a
* watchpoint at the same time, in which case we should fall
* through and handle the watchpoint.
*/
}
/* /*
* If dr6 has no reason to give us about the origin of this trap, * If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap. * then it's very likely the result of an icebp/int01 trap.
...@@ -624,7 +659,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ...@@ -624,7 +659,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (!dr6 && user_mode(regs)) if (!dr6 && user_mode(regs))
user_icebp = 1; user_icebp = 1;
/* Catch kmemcheck conditions first of all! */ /* Catch kmemcheck conditions! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
goto exit; goto exit;
...@@ -659,14 +694,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ...@@ -659,14 +694,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
goto exit; goto exit;
} }
/* if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
* Single-stepping through system calls: ignore any exceptions in /*
* kernel space, but re-enable TF when returning to user mode. * Historical junk that used to handle SYSENTER single-stepping.
* * This should be unreachable now. If we survive for a while
* We already checked v86 mode above, so we can check for kernel mode * without anyone hitting this warning, we'll turn this into
* by just checking the CPL of CS. * an oops.
*/ */
if ((dr6 & DR_STEP) && !user_mode(regs)) {
tsk->thread.debugreg6 &= ~DR_STEP; tsk->thread.debugreg6 &= ~DR_STEP;
set_tsk_thread_flag(tsk, TIF_SINGLESTEP); set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF; regs->flags &= ~X86_EFLAGS_TF;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment