Commit 0e1dbccd authored by Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "Two families of fixes:

   - Fix an FPU context related boot crash on newer x86 hardware with
     larger context sizes than what most people test.  To fix this
     without ugly kludges or extensive reverts we had to touch core task
     allocator, to allow x86 to determine the task size dynamically, at
     boot time.

     I've tested it on a number of x86 platforms, and I cross-built it
     to a handful of architectures:

                                        (warns)               (warns)
       testing     x86-64:  -git:  pass (    0),  -tip:  pass (    0)
       testing     x86-32:  -git:  pass (    0),  -tip:  pass (    0)
       testing        arm:  -git:  pass ( 1359),  -tip:  pass ( 1359)
       testing       cris:  -git:  pass ( 1031),  -tip:  pass ( 1031)
       testing       m32r:  -git:  pass ( 1135),  -tip:  pass ( 1135)
       testing       m68k:  -git:  pass ( 1471),  -tip:  pass ( 1471)
       testing       mips:  -git:  pass ( 1162),  -tip:  pass ( 1162)
       testing    mn10300:  -git:  pass ( 1058),  -tip:  pass ( 1058)
       testing     parisc:  -git:  pass ( 1846),  -tip:  pass ( 1846)
       testing      sparc:  -git:  pass ( 1185),  -tip:  pass ( 1185)

     ... so I hope the cross-arch impact is 'none', as intended.

     (by Dave Hansen)

   - Fix various NMI handling related bugs unearthed by the big asm code
     rewrite and generally make the NMI code more robust and more
     maintainable while at it.  These changes are a bit late in the
     cycle, I hope they are still acceptable.

     (by Andy Lutomirski)"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and use it on x86
  x86/fpu, sched: Dynamically allocate 'struct fpu'
  x86/entry/64, x86/nmi/64: Add CONFIG_DEBUG_ENTRY NMI testing code
  x86/nmi/64: Make the "NMI executing" variable more consistent
  x86/nmi/64: Minor asm simplification
  x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI detection
  x86/nmi/64: Reorder nested NMI checks
  x86/nmi/64: Improve nested NMI comments
  x86/nmi/64: Switch stacks on userspace NMI entry
  x86/nmi/64: Remove asm code that saves CR2
  x86/nmi: Enable nested do_nmi() handling for 64-bit kernels
parents dae57fb6 5aaeb5c0
...@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR ...@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
config ARCH_THREAD_INFO_ALLOCATOR config ARCH_THREAD_INFO_ALLOCATOR
bool bool
# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
config ARCH_WANTS_DYNAMIC_TASK_STRUCT
bool
config HAVE_REGS_AND_STACK_ACCESS_API config HAVE_REGS_AND_STACK_ACCESS_API
bool bool
help help
......
...@@ -41,6 +41,7 @@ config X86 ...@@ -41,6 +41,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32 select ARCH_WANT_IPC_PARSE_VERSION if X86_32
select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_OPTIONAL_GPIOLIB
......
...@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING ...@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
If unsure, say N. If unsure, say N.
config DEBUG_ENTRY
bool "Debug low-level entry code"
depends on DEBUG_KERNEL
---help---
This option enables sanity checks in x86's low-level entry code.
Some of these sanity checks may slow down kernel entries and
exits or otherwise impact performance.
This is currently used to help test NMI code.
If unsure, say N.
config DEBUG_NMI_SELFTEST config DEBUG_NMI_SELFTEST
bool "NMI Selftest" bool "NMI Selftest"
depends on DEBUG_KERNEL && X86_LOCAL_APIC depends on DEBUG_KERNEL && X86_LOCAL_APIC
......
...@@ -1237,11 +1237,12 @@ ENTRY(nmi) ...@@ -1237,11 +1237,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI * If the variable is not set and the stack is not the NMI
* stack then: * stack then:
* o Set the special variable on the stack * o Set the special variable on the stack
* o Copy the interrupt frame into a "saved" location on the stack * o Copy the interrupt frame into an "outermost" location on the
* o Copy the interrupt frame into a "copy" location on the stack * stack
* o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI * o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack: * If the variable is set or the previous stack is the NMI stack:
* o Modify the "copy" location to jump to the repeate_nmi * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI * o return back to the first NMI
* *
* Now on exit of the first NMI, we first clear the stack variable * Now on exit of the first NMI, we first clear the stack variable
...@@ -1250,31 +1251,151 @@ ENTRY(nmi) ...@@ -1250,31 +1251,151 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a * a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second * jump will be made to the repeat_nmi code that will handle the second
* NMI. * NMI.
*
* However, espfix prevents us from directly returning to userspace
* with a single IRET instruction. Similarly, IRET to user mode
* can fault. We therefore handle NMIs from user space like
* other IST entries.
*/ */
/* Use %rdx as our temp variable throughout */ /* Use %rdx as our temp variable throughout */
pushq %rdx pushq %rdx
testb $3, CS-RIP+8(%rsp)
jz .Lnmi_from_kernel
/*
* NMI from user mode. We need to run on the thread stack, but we
* can't go through the normal entry paths: NMIs are masked, and
* we don't want to enable interrupts, because then we'll end
* up in an awkward situation in which IRQs are on but NMIs
* are off.
*/
SWAPGS
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
pushq 3*8(%rdx) /* pt_regs->flags */
pushq 2*8(%rdx) /* pt_regs->cs */
pushq 1*8(%rdx) /* pt_regs->rip */
pushq $-1 /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq (%rdx) /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq %rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
pushq %rbx /* pt_regs->rbx */
pushq %rbp /* pt_regs->rbp */
pushq %r12 /* pt_regs->r12 */
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
/* /*
* If %cs was not the kernel segment, then the NMI triggered in user * At this point we no longer need to worry about stack damage
* space, which means it is definitely not nested. * due to nesting -- we're on the normal thread stack and we're
* done with the NMI stack.
*/ */
cmpl $__KERNEL_CS, 16(%rsp)
jne first_nmi movq %rsp, %rdi
movq $-1, %rsi
call do_nmi
/* /*
* Check the special variable on the stack to see if NMIs are * Return back to user mode. We must *not* do the normal exit
* executing. * work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
*/
SWAPGS
jmp restore_c_regs_and_iret
.Lnmi_from_kernel:
/*
* Here's what our stack frame will look like:
* +---------------------------------------------------------+
* | original SS |
* | original Return RSP |
* | original RFLAGS |
* | original CS |
* | original RIP |
* +---------------------------------------------------------+
* | temp storage for rdx |
* +---------------------------------------------------------+
* | "NMI executing" variable |
* +---------------------------------------------------------+
* | iret SS } Copied from "outermost" frame |
* | iret Return RSP } on each loop iteration; overwritten |
* | iret RFLAGS } by a nested NMI to force another |
* | iret CS } iteration if needed. |
* | iret RIP } |
* +---------------------------------------------------------+
* | outermost SS } initialized in first_nmi; |
* | outermost Return RSP } will not be changed before |
* | outermost RFLAGS } NMI processing is done. |
* | outermost CS } Copied to "iret" frame on each |
* | outermost RIP } iteration. |
* +---------------------------------------------------------+
* | pt_regs |
* +---------------------------------------------------------+
*
* The "original" frame is used by hardware. Before re-enabling
* NMIs, we need to be done with it, and we need to leave enough
* space for the asm code here.
*
* We return by executing IRET while RSP points to the "iret" frame.
* That will either return for real or it will loop back into NMI
* processing.
*
* The "outermost" frame is copied to the "iret" frame on each
* iteration of the loop, so each iteration starts with the "iret"
* frame pointing to the final return target.
*/
/*
* Determine whether we're a nested NMI.
*
* If we interrupted kernel code between repeat_nmi and
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
* about to call do_nmi anyway, so we can just
* resume the outer NMI.
*/
movq $repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
ja 1f
movq $end_repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
ja nested_nmi_out
1:
/*
* Now check "NMI executing". If it's set, then we're nested.
* This will not detect if we interrupted an outer NMI just
* before IRET.
*/ */
cmpl $1, -8(%rsp) cmpl $1, -8(%rsp)
je nested_nmi je nested_nmi
/* /*
* Now test if the previous stack was an NMI stack. * Now test if the previous stack was an NMI stack. This covers
* We need the double check. We check the NMI stack to satisfy the * the case where we interrupt an outer NMI after it clears
* race when the first NMI clears the variable before returning. * "NMI executing" but before IRET. We need to be careful, though:
* We check the variable because the first NMI could be in a * there is one case in which RSP could point to the NMI stack
* breakpoint routine using a breakpoint stack. * despite there being no NMI active: naughty userspace controls
* RSP at the very beginning of the SYSCALL targets. We can
* pull a fast one on naughty userspace, though: we program
* SYSCALL to mask DF, so userspace cannot cause DF to be set
* if it controls the kernel's RSP. We set DF before we clear
* "NMI executing".
*/ */
lea 6*8(%rsp), %rdx lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
...@@ -1286,25 +1407,20 @@ ENTRY(nmi) ...@@ -1286,25 +1407,20 @@ ENTRY(nmi)
cmpq %rdx, 4*8(%rsp) cmpq %rdx, 4*8(%rsp)
/* If it is below the NMI stack, it is a normal NMI */ /* If it is below the NMI stack, it is a normal NMI */
jb first_nmi jb first_nmi
/* Ah, it is within the NMI stack, treat it as nested */
/* Ah, it is within the NMI stack. */
testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
jz first_nmi /* RSP was user controlled. */
/* This is a nested NMI. */
nested_nmi: nested_nmi:
/* /*
* Do nothing if we interrupted the fixup in repeat_nmi. * Modify the "iret" frame to point to repeat_nmi, forcing another
* It's about to repeat the NMI handler, so we are fine * iteration of NMI handling.
* with ignoring this one.
*/ */
movq $repeat_nmi, %rdx subq $8, %rsp
cmpq 8(%rsp), %rdx
ja 1f
movq $end_repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
ja nested_nmi_out
1:
/* Set up the interrupted NMIs stack to jump to repeat_nmi */
leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
leaq -10*8(%rsp), %rdx leaq -10*8(%rsp), %rdx
pushq $__KERNEL_DS pushq $__KERNEL_DS
pushq %rdx pushq %rdx
...@@ -1318,61 +1434,42 @@ nested_nmi: ...@@ -1318,61 +1434,42 @@ nested_nmi:
nested_nmi_out: nested_nmi_out:
popq %rdx popq %rdx
/* No need to check faults here */ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN INTERRUPT_RETURN
first_nmi: first_nmi:
/* /* Restore rdx. */
* Because nested NMIs will use the pushed location that we
* stored in rdx, we must keep that space available.
* Here's what our stack frame will look like:
* +-------------------------+
* | original SS |
* | original Return RSP |
* | original RFLAGS |
* | original CS |
* | original RIP |
* +-------------------------+
* | temp storage for rdx |
* +-------------------------+
* | NMI executing variable |
* +-------------------------+
* | copied SS |
* | copied Return RSP |
* | copied RFLAGS |
* | copied CS |
* | copied RIP |
* +-------------------------+
* | Saved SS |
* | Saved Return RSP |
* | Saved RFLAGS |
* | Saved CS |
* | Saved RIP |
* +-------------------------+
* | pt_regs |
* +-------------------------+
*
* The saved stack frame is used to fix up the copied stack frame
* that a nested NMI may change to make the interrupted NMI iret jump
* to the repeat_nmi. The original stack frame and the temp storage
* is also used by nested NMIs and can not be trusted on exit.
*/
/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
movq (%rsp), %rdx movq (%rsp), %rdx
/* Set the NMI executing variable on the stack. */ /* Make room for "NMI executing". */
pushq $1 pushq $0
/* Leave room for the "copied" frame */ /* Leave room for the "iret" frame */
subq $(5*8), %rsp subq $(5*8), %rsp
/* Copy the stack frame to the Saved frame */ /* Copy the "original" frame to the "outermost" frame */
.rept 5 .rept 5
pushq 11*8(%rsp) pushq 11*8(%rsp)
.endr .endr
/* Everything up to here is safe from nested NMIs */ /* Everything up to here is safe from nested NMIs */
#ifdef CONFIG_DEBUG_ENTRY
/*
* For ease of testing, unmask NMIs right away. Disabled by
* default because IRET is very expensive.
*/
pushq $0 /* SS */
pushq %rsp /* RSP (minus 8 because of the previous push) */
addq $8, (%rsp) /* Fix up RSP */
pushfq /* RFLAGS */
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
INTERRUPT_RETURN /* continues at repeat_nmi below */
1:
#endif
repeat_nmi:
/* /*
* If there was a nested NMI, the first NMI's iret will return * If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another * here. But NMIs are still enabled and we can take another
...@@ -1381,16 +1478,20 @@ first_nmi: ...@@ -1381,16 +1478,20 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway. * it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested * This makes it safe to copy to the stack frame that a nested
* NMI will update. * NMI will update.
*
* RSP is pointing to "outermost RIP". gsbase is unknown, but, if
* we're repeating an NMI, gsbase has the same value that it had on
* the first iteration. paranoid_entry will load the kernel
* gsbase if needed before we call do_nmi. "NMI executing"
* is zero.
*/ */
repeat_nmi: movq $1, 10*8(%rsp) /* Set "NMI executing". */
/* /*
* Update the stack variable to say we are still in NMI (the update * Copy the "outermost" frame to the "iret" frame. NMIs that nest
* is benign for the non-repeat case, where 1 was pushed just above * here must not modify the "iret" frame while we're writing to
* to this very stack slot). * it or it will end up containing garbage.
*/ */
movq $1, 10*8(%rsp)
/* Make another copy, this one may be modified by nested NMIs */
addq $(10*8), %rsp addq $(10*8), %rsp
.rept 5 .rept 5
pushq -6*8(%rsp) pushq -6*8(%rsp)
...@@ -1399,9 +1500,9 @@ repeat_nmi: ...@@ -1399,9 +1500,9 @@ repeat_nmi:
end_repeat_nmi: end_repeat_nmi:
/* /*
* Everything below this point can be preempted by a nested * Everything below this point can be preempted by a nested NMI.
* NMI if the first NMI took an exception and reset our iret stack * If this happens, then the inner NMI will change the "iret"
* so that we repeat another NMI. * frame to point back to repeat_nmi.
*/ */
pushq $-1 /* ORIG_RAX: no syscall to restart */ pushq $-1 /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK ALLOC_PT_GPREGS_ON_STACK
...@@ -1415,28 +1516,11 @@ end_repeat_nmi: ...@@ -1415,28 +1516,11 @@ end_repeat_nmi:
*/ */
call paranoid_entry call paranoid_entry
/*
* Save off the CR2 register. If we take a page fault in the NMI then
* it could corrupt the CR2 value. If the NMI preempts a page fault
* handler before it was able to read the CR2 register, and then the
* NMI itself takes a page fault, the page fault that was preempted
* will read the information from the NMI page fault and not the
* origin fault. Save it off and restore it if it changes.
* Use the r12 callee-saved register.
*/
movq %cr2, %r12
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi movq %rsp, %rdi
movq $-1, %rsi movq $-1, %rsi
call do_nmi call do_nmi
/* Did the NMI take a page fault? Restore cr2 if it did */
movq %cr2, %rcx
cmpq %rcx, %r12
je 1f
movq %r12, %cr2
1:
testl %ebx, %ebx /* swapgs needed? */ testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore jnz nmi_restore
nmi_swapgs: nmi_swapgs:
...@@ -1444,11 +1528,26 @@ nmi_swapgs: ...@@ -1444,11 +1528,26 @@ nmi_swapgs:
nmi_restore: nmi_restore:
RESTORE_EXTRA_REGS RESTORE_EXTRA_REGS
RESTORE_C_REGS RESTORE_C_REGS
/* Pop the extra iret frame at once */
/* Point RSP at the "iret" frame. */
REMOVE_PT_GPREGS_FROM_STACK 6*8 REMOVE_PT_GPREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */ /*
movq $0, 5*8(%rsp) * Clear "NMI executing". Set DF first so that we can easily
* distinguish the remaining code between here and IRET from
* the SYSCALL entry and exit paths. On a native kernel, we
* could just inspect RIP, but, on paravirt kernels,
* INTERRUPT_RETURN can translate into a jump into a
* hypercall page.
*/
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
* INTERRUPT_RETURN reads the "iret" frame and exits the NMI
* stack in a single instruction. We are returning to kernel
* mode, so this cannot result in a fault.
*/
INTERRUPT_RETURN INTERRUPT_RETURN
END(nmi) END(nmi)
......
...@@ -189,6 +189,7 @@ union fpregs_state { ...@@ -189,6 +189,7 @@ union fpregs_state {
struct fxregs_state fxsave; struct fxregs_state fxsave;
struct swregs_state soft; struct swregs_state soft;
struct xregs_state xsave; struct xregs_state xsave;
u8 __padding[PAGE_SIZE];
}; };
/* /*
...@@ -197,40 +198,6 @@ union fpregs_state { ...@@ -197,40 +198,6 @@ union fpregs_state {
* state fields: * state fields:
*/ */
struct fpu { struct fpu {
/*
* @state:
*
* In-memory copy of all FPU registers that we save/restore
* over context switches. If the task is using the FPU then
* the registers in the FPU are more recent than this state
* copy. If the task context-switches away then they get
* saved here and represent the FPU state.
*
* After context switches there may be a (short) time period
* during which the in-FPU hardware registers are unchanged
* and still perfectly match this state, if the tasks
* scheduled afterwards are not using the FPU.
*
* This is the 'lazy restore' window of optimization, which
* we track through 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
*
* We detect whether a subsequent task uses the FPU via setting
* CR0::TS to 1, which causes any FPU use to raise a #NM fault.
*
* During this window, if the task gets scheduled again, we
* might be able to skip having to do a restore from this
* memory buffer to the hardware registers - at the cost of
* incurring the overhead of #NM fault traps.
*
* Note that on modern CPUs that support the XSAVEOPT (or other
* optimized XSAVE instructions), we don't use #NM traps anymore,
* as the hardware can track whether FPU registers need saving
* or not. On such CPUs we activate the non-lazy ('eagerfpu')
* logic, which unconditionally saves/restores all FPU state
* across context switches. (if FPU state exists.)
*/
union fpregs_state state;
/* /*
* @last_cpu: * @last_cpu:
* *
...@@ -288,6 +255,43 @@ struct fpu { ...@@ -288,6 +255,43 @@ struct fpu {
* deal with bursty apps that only use the FPU for a short time: * deal with bursty apps that only use the FPU for a short time:
*/ */
unsigned char counter; unsigned char counter;
/*
* @state:
*
* In-memory copy of all FPU registers that we save/restore
* over context switches. If the task is using the FPU then
* the registers in the FPU are more recent than this state
* copy. If the task context-switches away then they get
* saved here and represent the FPU state.
*
* After context switches there may be a (short) time period
* during which the in-FPU hardware registers are unchanged
* and still perfectly match this state, if the tasks
* scheduled afterwards are not using the FPU.
*
* This is the 'lazy restore' window of optimization, which
* we track through 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
*
* We detect whether a subsequent task uses the FPU via setting
* CR0::TS to 1, which causes any FPU use to raise a #NM fault.
*
* During this window, if the task gets scheduled again, we
* might be able to skip having to do a restore from this
* memory buffer to the hardware registers - at the cost of
* incurring the overhead of #NM fault traps.
*
* Note that on modern CPUs that support the XSAVEOPT (or other
* optimized XSAVE instructions), we don't use #NM traps anymore,
* as the hardware can track whether FPU registers need saving
* or not. On such CPUs we activate the non-lazy ('eagerfpu')
* logic, which unconditionally saves/restores all FPU state
* across context switches. (if FPU state exists.)
*/
union fpregs_state state;
/*
* WARNING: 'state' is dynamically-sized. Do not put
* anything after it here.
*/
}; };
#endif /* _ASM_X86_FPU_H */ #endif /* _ASM_X86_FPU_H */
...@@ -390,9 +390,6 @@ struct thread_struct { ...@@ -390,9 +390,6 @@ struct thread_struct {
#endif #endif
unsigned long gs; unsigned long gs;
/* Floating point and extended processor state */
struct fpu fpu;
/* Save middle states of ptrace breakpoints */ /* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM]; struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */ /* Debug status used for traps, single steps, etc... */
...@@ -418,6 +415,13 @@ struct thread_struct { ...@@ -418,6 +415,13 @@ struct thread_struct {
unsigned long iopl; unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */ /* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max; unsigned io_bitmap_max;
/* Floating point and extended processor state */
struct fpu fpu;
/*
* WARNING: 'fpu' is dynamically-sized. It *MUST* be at
* the end.
*/
}; };
/* /*
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h> #include <asm/fpu/internal.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <linux/sched.h>
/* /*
* Initialize the TS bit in CR0 according to the style of context-switches * Initialize the TS bit in CR0 according to the style of context-switches
* we are using: * we are using:
...@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void) ...@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size; unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size); EXPORT_SYMBOL_GPL(xstate_size);
/* Build-time assertion that 'MEMBER' is the final field of 'TYPE': */
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
	BUILD_BUG_ON(offsetofend(TYPE, MEMBER) != sizeof(TYPE))
/*
 * Compute the size of 'task_struct' that is actually needed: the
 * 'struct fpu' sits at the tail of the task_struct, so its statically
 * sized register state is swapped for the dynamically-calculated
 * 'xstate_size'.
 */
static void __init fpu__init_task_struct_size(void)
{
	int dynamic_size;

	/*
	 * The arithmetic below is only valid while the register state is
	 * the very last thing in the task_struct: 'state' must end
	 * 'struct fpu', 'fpu' must end 'thread_struct' and 'thread' must
	 * end 'task_struct'.  A compile error here means something got
	 * added after one of them.
	 */
	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
	CHECK_MEMBER_AT_END_OF(struct fpu, state);

	/*
	 * Start from the static size, drop the (potentially heavily
	 * padded) static register state, then add back the
	 * dynamically-calculated register state size.
	 */
	dynamic_size = sizeof(struct task_struct);
	dynamic_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
	dynamic_size += xstate_size;

	arch_task_struct_size = dynamic_size;
}
/* /*
* Set up the xstate_size based on the legacy FPU context size. * Set up the xstate_size based on the legacy FPU context size.
* *
...@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) ...@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic(); fpu__init_system_generic();
fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate(); fpu__init_system_xstate();
fpu__init_task_struct_size();
fpu__init_system_ctx_switch(); fpu__init_system_ctx_switch();
} }
......
...@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs) ...@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi); NOKPROBE_SYMBOL(default_do_nmi);
/* /*
* NMIs can hit breakpoints which will cause it to lose its * NMIs can page fault or hit breakpoints which will cause it to lose
* NMI context with the CPU when the breakpoint does an iret. * its NMI context with the CPU when the breakpoint or page fault does an IRET.
*/ *
#ifdef CONFIG_X86_32 * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
/* * NMI processing. On x86_64, the asm glue protects us from nested NMIs
* For i386, NMIs use the same stack as the kernel, and we can * if the outer NMI came from kernel mode, but we can still nest if the
* add a workaround to the iret problem in C (preventing nested * outer NMI came from user mode.
* NMIs if an NMI takes a trap). Simply have 3 states the NMI *
* can be in: * To handle these nested NMIs, we have three states:
* *
* 1) not running * 1) not running
* 2) executing * 2) executing
...@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi); ...@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering, * (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.) * when one is running, are ignored. Only one NMI is restarted.)
* *
* If an NMI hits a breakpoint that executes an iret, another * If an NMI executes an iret, another NMI can preempt it. We do not
* NMI can preempt it. We do not want to allow this new NMI * want to allow this new NMI to run, but we want to execute it when the
* to run, but we want to execute it when the first one finishes. * first one finishes. We set the state to "latched", and the exit of
* We set the state to "latched", and the exit of the first NMI will * the first NMI will perform a dec_return, if the result is zero
* perform a dec_return, if the result is zero (NOT_RUNNING), then * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
* it will simply exit the NMI handler. If not, the dec_return * dec_return would have set the state to NMI_EXECUTING (what we want it
* would have set the state to NMI_EXECUTING (what we want it to * to be when we are running). In this case, we simply jump back to
* be when we are running). In this case, we simply jump back * rerun the NMI handler again, and restart the 'latched' NMI.
* to rerun the NMI handler again, and restart the 'latched' NMI.
* *
* No trap (breakpoint or page fault) should be hit before nmi_restart, * No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING * thus there is no race between the first check of state for NOT_RUNNING
...@@ -461,49 +460,36 @@ enum nmi_states { ...@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2); static DEFINE_PER_CPU(unsigned long, nmi_cr2);
#define nmi_nesting_preprocess(regs) \ #ifdef CONFIG_X86_64
do { \
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
this_cpu_write(nmi_state, NMI_LATCHED); \
return; \
} \
this_cpu_write(nmi_state, NMI_EXECUTING); \
this_cpu_write(nmi_cr2, read_cr2()); \
} while (0); \
nmi_restart:
#define nmi_nesting_postprocess() \
do { \
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
write_cr2(this_cpu_read(nmi_cr2)); \
if (this_cpu_dec_return(nmi_state)) \
goto nmi_restart; \
} while (0)
#else /* x86_64 */
/* /*
* In x86_64 things are a bit more difficult. This has the same problem * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
* where an NMI hitting a breakpoint that calls iret will remove the * some care, the inner breakpoint will clobber the outer breakpoint's
* NMI context, allowing a nested NMI to enter. What makes this more * stack.
* difficult is that both NMIs and breakpoints have their own stack.
* When a new NMI or breakpoint is executed, the stack is set to a fixed
* point. If an NMI is nested, it will have its stack set at that same
* fixed address that the first NMI had, and will start corrupting the
* stack. This is handled in entry_64.S, but the same problem exists with
* the breakpoint stack.
* *
* If a breakpoint is being processed, and the debug stack is being used, * If a breakpoint is being processed, and the debug stack is being
* if an NMI comes in and also hits a breakpoint, the stack pointer * used, if an NMI comes in and also hits a breakpoint, the stack
* will be set to the same fixed address as the breakpoint that was * pointer will be set to the same fixed address as the breakpoint that
* interrupted, causing that stack to be corrupted. To handle this case, * was interrupted, causing that stack to be corrupted. To handle this
* check if the stack that was interrupted is the debug stack, and if * case, check if the stack that was interrupted is the debug stack, and
* so, change the IDT so that new breakpoints will use the current stack * if so, change the IDT so that new breakpoints will use the current
* and not switch to the fixed address. On return of the NMI, switch back * stack and not switch to the fixed address. On return of the NMI,
* to the original IDT. * switch back to the original IDT.
*/ */
static DEFINE_PER_CPU(int, update_debug_stack); static DEFINE_PER_CPU(int, update_debug_stack);
#endif
static inline void nmi_nesting_preprocess(struct pt_regs *regs) dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{ {
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
this_cpu_write(nmi_state, NMI_LATCHED);
return;
}
this_cpu_write(nmi_state, NMI_EXECUTING);
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
#ifdef CONFIG_X86_64
/* /*
* If we interrupted a breakpoint, it is possible that * If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to * the nmi handler will have breakpoints too. We need to
...@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs) ...@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero(); debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1); this_cpu_write(update_debug_stack, 1);
} }
}
static inline void nmi_nesting_postprocess(void)
{
if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
this_cpu_write(update_debug_stack, 0);
}
}
#endif #endif
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
nmi_enter(); nmi_enter();
inc_irq_stat(__nmi_count); inc_irq_stat(__nmi_count);
...@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code) ...@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit(); nmi_exit();
/* On i386, may loop back to preprocess */ #ifdef CONFIG_X86_64
nmi_nesting_postprocess(); if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
this_cpu_write(update_debug_stack, 0);
}
#endif
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
} }
NOKPROBE_SYMBOL(do_nmi); NOKPROBE_SYMBOL(do_nmi);
......
...@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); ...@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
*/ */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{ {
*dst = *src; memcpy(dst, src, arch_task_struct_size);
return fpu__copy(&dst->thread.fpu, &src->thread.fpu); return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
} }
......
...@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) ...@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(CORE_STR), 4)) +
roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prstatus), 4) +
roundup(sizeof(struct elf_prpsinfo), 4) + roundup(sizeof(struct elf_prpsinfo), 4) +
roundup(sizeof(struct task_struct), 4); roundup(arch_task_struct_size, 4);
*elf_buflen = PAGE_ALIGN(*elf_buflen); *elf_buflen = PAGE_ALIGN(*elf_buflen);
return size + *elf_buflen; return size + *elf_buflen;
} }
...@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) ...@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
/* set up the task structure */ /* set up the task structure */
notes[2].name = CORE_STR; notes[2].name = CORE_STR;
notes[2].type = NT_TASKSTRUCT; notes[2].type = NT_TASKSTRUCT;
notes[2].datasz = sizeof(struct task_struct); notes[2].datasz = arch_task_struct_size;
notes[2].data = current; notes[2].data = current;
nhdr->p_filesz += notesize(&notes[2]); nhdr->p_filesz += notesize(&notes[2]);
......
...@@ -1522,8 +1522,6 @@ struct task_struct { ...@@ -1522,8 +1522,6 @@ struct task_struct {
/* hung task detection */ /* hung task detection */
unsigned long last_switch_count; unsigned long last_switch_count;
#endif #endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */ /* filesystem information */
struct fs_struct *fs; struct fs_struct *fs;
/* open file information */ /* open file information */
...@@ -1778,8 +1776,22 @@ struct task_struct { ...@@ -1778,8 +1776,22 @@ struct task_struct {
unsigned long task_state_change; unsigned long task_state_change;
#endif #endif
int pagefault_disabled; int pagefault_disabled;
/* CPU-specific state of this task */
struct thread_struct thread;
/*
* WARNING: on x86, 'thread_struct' contains a variable-sized
* structure. It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
}; };
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif
/* Future-safe accessor for struct task_struct's cpus_allowed. */ /* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
......
...@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested) ...@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
} }
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif
void __init fork_init(void) void __init fork_init(void)
{ {
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
...@@ -295,7 +300,7 @@ void __init fork_init(void) ...@@ -295,7 +300,7 @@ void __init fork_init(void)
#endif #endif
/* create a slab on which task_structs can be allocated */ /* create a slab on which task_structs can be allocated */
task_struct_cachep = task_struct_cachep =
kmem_cache_create("task_struct", sizeof(struct task_struct), kmem_cache_create("task_struct", arch_task_struct_size,
ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment