Commit ab851d49 authored by Linus Torvalds

Merge branch 'x86-iopl-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 iopl updates from Ingo Molnar:
 "This implements a nice simplification of the iopl and ioperm code that
  Thomas Gleixner discovered: we can implement the IO privilege features
  of the iopl system call by using the IO permission bitmap in
  permissive mode, while trapping CLI/STI/POPF/PUSHF uses in user-space
  if they change the interrupt flag.

  This implements that feature, with testing facilities and related
  cleanups"

[ "Simplification" may be an over-statement. The main goal is to avoid
  the cli/sti of iopl by effectively implementing the IO port access
  parts of iopl in terms of ioperm.

  This may end up not working well in case people actually depend on
  cli/sti being available, or if there are mixed uses of iopl and
  ioperm. We will see..  - Linus ]

* 'x86-iopl-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (22 commits)
  x86/ioperm: Fix use of deprecated config option
  x86/entry/32: Clarify register saving in __switch_to_asm()
  selftests/x86/iopl: Extend test to cover IOPL emulation
  x86/ioperm: Extend IOPL config to control ioperm() as well
  x86/iopl: Remove legacy IOPL option
  x86/iopl: Restrict iopl() permission scope
  x86/iopl: Fixup misleading comment
  selftests/x86/ioperm: Extend testing so the shared bitmap is exercised
  x86/ioperm: Share I/O bitmap if identical
  x86/ioperm: Remove bitmap if all permissions dropped
  x86/ioperm: Move TSS bitmap update to exit to user work
  x86/ioperm: Add bitmap sequence number
  x86/ioperm: Move iobitmap data into a struct
  x86/tss: Move I/O bitmap data into a seperate struct
  x86/io: Speedup schedule out of I/O bitmap user
  x86/ioperm: Avoid bitmap allocation if no permissions are set
  x86/ioperm: Simplify first ioperm() invocation logic
  x86/iopl: Cleanup include maze
  x86/tss: Fix and move VMX BUILD_BUG_ON()
  x86/cpu: Unify cpu_init()
  ...
parents 1d872004 e3cb0c71
...
@@ -1224,6 +1224,24 @@ config X86_VSYSCALL_EMULATION
 	  Disabling this option saves about 7K of kernel size and
 	  possibly 4K of additional runtime pagetable memory.

+config X86_IOPL_IOPERM
+	bool "IOPERM and IOPL Emulation"
+	default y
+	---help---
+	  This enables the ioperm() and iopl() syscalls which are necessary
+	  for legacy applications.
+
+	  Legacy IOPL support is an overbroad mechanism which allows user
+	  space, aside from accessing all 65536 I/O ports, also to disable
+	  interrupts. To gain this access the caller needs CAP_SYS_RAWIO
+	  capabilities and permission from potentially active security
+	  modules.
+
+	  The emulation restricts the functionality of the syscall to
+	  only allowing the full range I/O port access, but prevents the
+	  ability to disable interrupts from user space which would be
+	  granted if the hardware IOPL mechanism would be used.
+
 config TOSHIBA
 	tristate "Toshiba Laptop support"
 	depends on X86_32
...
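For a sense of what this option changes for applications, here is a minimal user-space sketch (not part of this series) of the emulated contract: iopl(3) still grants access to all 65536 ports, but CLI now faults. Port 0x80 and the SIGSEGV probe are assumptions of the demo; build with gcc and run as root on a kernel with CONFIG_X86_IOPL_IOPERM=y.

#include <stdio.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/io.h>

static sigjmp_buf env;

static void on_sigsegv(int sig)
{
	siglongjmp(env, 1);
}

int main(void)
{
	if (iopl(3)) {			/* needs CAP_SYS_RAWIO */
		perror("iopl(3)");
		return 1;
	}

	outb(0, 0x80);			/* I/O access: still permitted */
	printf("outb to 0x80 worked\n");

	signal(SIGSEGV, on_sigsegv);
	if (sigsetjmp(env, 1) == 0) {
		asm volatile ("cli");	/* traps under emulation */
		printf("cli worked (hardware IOPL semantics)\n");
	} else {
		printf("cli faulted (emulated IOPL semantics)\n");
	}
	return 0;
}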
...
@@ -33,6 +33,7 @@
 #include <asm/cpufeature.h>
 #include <asm/fpu/api.h>
 #include <asm/nospec-branch.h>
+#include <asm/io_bitmap.h>

 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
...
@@ -196,6 +197,9 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	/* Reload ti->flags; we may have rescheduled above. */
 	cached_flags = READ_ONCE(ti->flags);

+	if (unlikely(cached_flags & _TIF_IO_BITMAP))
+		tss_update_io_bitmap();
+
 	fpregs_assert_state_consistent();
 	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
 		switch_fpu_return();
...
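The hunk above moves the expensive TSS bitmap work out of the context switch and into the exit-to-user path. A toy model (an assumption for illustration, not kernel code) of that ordering: switching out only invalidates, and publishing is charged to the task that actually uses a bitmap.

#include <stdbool.h>
#include <stdio.h>

struct task { bool tif_io_bitmap; const char *name; };

static bool tss_valid;

static void switch_out(struct task *prev)
{
	if (prev->tif_io_bitmap)
		tss_valid = false;	/* invalidate, don't copy */
}

static void exit_to_user(struct task *next)
{
	if (next->tif_io_bitmap) {
		tss_valid = true;	/* copy/enable happens here */
		printf("%s: bitmap published on exit to user\n", next->name);
	}
}

int main(void)
{
	struct task ioperm_user = { true, "ioperm_user" };
	struct task plain = { false, "plain" };

	switch_out(&ioperm_user);	/* cheap: just invalidate */
	exit_to_user(&plain);		/* no work at all */
	switch_out(&plain);
	exit_to_user(&ioperm_user);	/* pays for its own copy */
	printf("tss_valid=%d\n", tss_valid);
	return 0;
}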
...
@@ -739,6 +739,11 @@ SYM_CODE_START(__switch_to_asm)
 	pushl	%ebx
 	pushl	%edi
 	pushl	%esi
+	/*
+	 * Flags are saved to prevent AC leakage. This could go
+	 * away if objtool would have 32bit support to verify
+	 * the STAC/CLAC correctness.
+	 */
 	pushfl

 	/* switch stack */
...
@@ -761,8 +766,9 @@ SYM_CODE_START(__switch_to_asm)
 	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
 #endif

-	/* restore callee-saved registers */
+	/* Restore flags of the incoming task to restore AC state. */
 	popfl
+	/* restore callee-saved registers */
 	popl	%esi
 	popl	%edi
 	popl	%ebx
...
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IOBITMAP_H
#define _ASM_X86_IOBITMAP_H
#include <linux/refcount.h>
#include <asm/processor.h>
struct io_bitmap {
u64 sequence;
refcount_t refcnt;
/* The maximum number of bytes to copy so all zero bits are covered */
unsigned int max;
unsigned long bitmap[IO_BITMAP_LONGS];
};
struct task_struct;
#ifdef CONFIG_X86_IOPL_IOPERM
void io_bitmap_share(struct task_struct *tsk);
void io_bitmap_exit(void);
void tss_update_io_bitmap(void);
#else
static inline void io_bitmap_share(struct task_struct *tsk) { }
static inline void io_bitmap_exit(void) { }
static inline void tss_update_io_bitmap(void) { }
#endif
#endif
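A user-space model of the sharing scheme these declarations set up (hedged: the kernel uses refcount_t and kmemdup(), approximated here with C11 atomics and malloc): fork() takes a reference, the next ioperm() unshares, and the last reference frees the bitmap.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct io_bitmap_model {
	atomic_int refcnt;
	unsigned long bitmap[4];	/* stand-in for IO_BITMAP_LONGS */
};

/* Drop one reference; free on the last one (cf. io_bitmap_exit()). */
static void bitmap_put(struct io_bitmap_model *bm)
{
	if (atomic_fetch_sub(&bm->refcnt, 1) == 1)
		free(bm);
}

/* fork(): parent and child share the same bitmap (cf. io_bitmap_share()). */
static struct io_bitmap_model *bitmap_share(struct io_bitmap_model *bm)
{
	atomic_fetch_add(&bm->refcnt, 1);
	return bm;
}

/* ioperm() on a shared bitmap: duplicate before modifying. */
static struct io_bitmap_model *bitmap_unshare(struct io_bitmap_model *bm)
{
	struct io_bitmap_model *copy;

	if (atomic_load(&bm->refcnt) == 1)
		return bm;		/* sole owner, modify in place */

	copy = malloc(sizeof(*copy));
	memcpy(copy->bitmap, bm->bitmap, sizeof(copy->bitmap));
	atomic_init(&copy->refcnt, 1);
	bitmap_put(bm);			/* drop the shared reference */
	return copy;
}

int main(void)
{
	struct io_bitmap_model *parent = malloc(sizeof(*parent));
	struct io_bitmap_model *child;

	atomic_init(&parent->refcnt, 1);
	memset(parent->bitmap, 0xff, sizeof(parent->bitmap));

	child = bitmap_share(parent);	/* fork: refcnt == 2 */
	child = bitmap_unshare(child);	/* child's ioperm(): private copy */
	printf("still shared: %d\n", child == parent);	/* prints 0 */

	bitmap_put(child);
	bitmap_put(parent);
	return 0;
}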
...
@@ -294,10 +294,6 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
 {
 	PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
 }

-static inline void set_iopl_mask(unsigned mask)
-{
-	PVOP_VCALL1(cpu.set_iopl_mask, mask);
-}
-
 static inline void paravirt_activate_mm(struct mm_struct *prev,
 					struct mm_struct *next)
...
...
@@ -140,8 +140,6 @@ struct pv_cpu_ops {
 	void (*load_sp0)(unsigned long sp0);

-	void (*set_iopl_mask)(unsigned mask);
-
 	void (*wbinvd)(void);

 	/* cpuid emulation, mostly so that caps bits can be disabled */
...
...
@@ -44,7 +44,7 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
  * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
  * to avoid include recursion hell
  */
-#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 39)
+#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 41)

 /* The +1 is for the readonly IDT page: */
 #define CPU_ENTRY_AREA_BASE	\
...
...
@@ -7,6 +7,7 @@
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
+struct io_bitmap;
 struct vm86;

 #include <asm/math_emu.h>
...
@@ -336,10 +337,32 @@ struct x86_hw_tss {
  * IO-bitmap sizes:
  */
 #define IO_BITMAP_BITS			65536
-#define IO_BITMAP_BYTES			(IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS			(IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
-#define INVALID_IO_BITMAP_OFFSET	0x8000
+#define IO_BITMAP_BYTES			(IO_BITMAP_BITS / BITS_PER_BYTE)
+#define IO_BITMAP_LONGS			(IO_BITMAP_BYTES / sizeof(long))
+
+#define IO_BITMAP_OFFSET_VALID_MAP				\
+	(offsetof(struct tss_struct, io_bitmap.bitmap) -	\
+	 offsetof(struct tss_struct, x86_tss))
+
+#define IO_BITMAP_OFFSET_VALID_ALL				\
+	(offsetof(struct tss_struct, io_bitmap.mapall) -	\
+	 offsetof(struct tss_struct, x86_tss))
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+/*
+ * sizeof(unsigned long) coming from an extra "long" at the end of the
+ * iobitmap. The limit is inclusive, i.e. the last valid byte.
+ */
+# define __KERNEL_TSS_LIMIT	\
+	(IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \
+	 sizeof(unsigned long) - 1)
+#else
+# define __KERNEL_TSS_LIMIT	\
+	(offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1)
+#endif
+
+/* Base offset outside of TSS_LIMIT so unprivileged IO causes #GP */
+#define IO_BITMAP_OFFSET_INVALID	(__KERNEL_TSS_LIMIT + 1)
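The arithmetic behind these constants, re-derived in a standalone sketch (assumes 8-byte longs; 8192, 1024 and 8199 are just the worked numbers):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define BITS_PER_BYTE	8
#define IO_BITMAP_BITS	65536
#define IO_BITMAP_BYTES	(IO_BITMAP_BITS / BITS_PER_BYTE)

int main(void)
{
	size_t longs = IO_BITMAP_BYTES / sizeof(long);

	assert(IO_BITMAP_BYTES == 8192);	/* 64K ports, one bit each */
	assert(longs == 1024);			/* with 8-byte longs */

	/*
	 * The limit is inclusive and must also cover the extra
	 * terminating long the CPU may touch past the bitmap,
	 * hence the "- 1".
	 */
	printf("bitmap span: %zu bytes\n",
	       (size_t)IO_BITMAP_BYTES + sizeof(unsigned long) - 1); /* 8199 */
	return 0;
}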

 struct entry_stack {
 	unsigned long		words[64];
...
@@ -349,13 +372,21 @@ struct entry_stack_page {
 	struct entry_stack stack;
 } __aligned(PAGE_SIZE);

+/*
+ * All IO bitmap related data stored in the TSS:
+ */
+struct x86_io_bitmap {
+	/* The sequence number of the last active bitmap. */
+	u64			prev_sequence;
+
+	/*
+	 * Store the dirty size of the last io bitmap offender. The next
+	 * one will have to do the cleanup as the switch out to a non io
+	 * bitmap user will just set x86_tss.io_bitmap_base to a value
+	 * outside of the TSS limit. So for sane tasks there is no need to
+	 * actually touch the io_bitmap at all.
+	 */
+	unsigned int		prev_max;
+
+	/*
+	 * The extra 1 is there because the CPU will access an
+	 * additional byte beyond the end of the IO permission
+	 * bitmap. The extra byte must be all 1 bits, and must
+	 * be within the limit.
+	 */
+	unsigned long		bitmap[IO_BITMAP_LONGS + 1];
+
+	/*
+	 * Special I/O bitmap to emulate IOPL(3). All bytes zero,
+	 * except the additional byte at the end.
+	 */
+	unsigned long		mapall[IO_BITMAP_LONGS + 1];
+};
+
 struct tss_struct {
 	/*
 	 * The fixed hardware portion. This must not cross a page boundary
 	 * at risk of violating the SDM's advice and potentially triggering
 	 * errata.
 	 */
 	struct x86_hw_tss	x86_tss;

-	/*
-	 * The extra 1 is there because the CPU will access an
-	 * additional byte beyond the end of the IO permission
-	 * bitmap. The extra byte must be all 1 bits, and must
-	 * be within the limit.
-	 */
-	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
+#ifdef CONFIG_X86_IOPL_IOPERM
+	struct x86_io_bitmap	io_bitmap;
+#endif
 } __aligned(PAGE_SIZE);

 DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);

-/*
- * sizeof(unsigned long) coming from an extra "long" at the end
- * of the iobitmap.
- *
- * -1? seg base+limit should be pointing to the address of the
- * last valid byte
- */
-#define __KERNEL_TSS_LIMIT	\
-	(IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
-
 /* Per CPU interrupt stacks */
 struct irq_stack {
...
@@ -488,10 +528,14 @@ struct thread_struct {
 	struct vm86		*vm86;
 #endif
 	/* IO permissions: */
-	unsigned long		*io_bitmap_ptr;
-	unsigned long		iopl;
-	/* Max allowed port in the bitmap, in bytes: */
-	unsigned		io_bitmap_max;
+	struct io_bitmap	*io_bitmap;
+
+	/*
+	 * IOPL. Privilege level dependent I/O permission which is
+	 * emulated via the I/O bitmap to prevent user space from disabling
+	 * interrupts.
+	 */
+	unsigned long		iopl_emul;

 	mm_segment_t		addr_limit;
...
@@ -523,25 +567,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
  */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/

-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void native_set_iopl_mask(unsigned mask)
-{
-#ifdef CONFIG_X86_32
-	unsigned int reg;
-
-	asm volatile ("pushfl;"
-		      "popl %0;"
-		      "andl %1, %0;"
-		      "orl %2, %0;"
-		      "pushl %0;"
-		      "popfl"
-		      : "=&r" (reg)
-		      : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-#endif
-}
-
 static inline void
 native_load_sp0(unsigned long sp0)
 {
...
@@ -581,7 +606,6 @@ static inline void load_sp0(unsigned long sp0)
 	native_load_sp0(sp0);
 }

-#define set_iopl_mask native_set_iopl_mask
 #endif /* CONFIG_PARAVIRT_XXL */

 /* Free all resources held by a thread. */
...
@@ -849,7 +873,6 @@ static inline void spin_lock_prefetch(const void *x)
 #define INIT_THREAD  {					\
 	.sp0		= TOP_OF_INIT_STACK,		\
 	.sysenter_cs	= __KERNEL_CS,			\
-	.io_bitmap_ptr	= NULL,				\
 	.addr_limit	= KERNEL_DS,			\
 }
...
...
@@ -361,5 +361,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);

+#ifdef CONFIG_X86_64
+# define do_set_thread_area_64(p, s, t)	do_arch_prctl_64(p, s, t)
+#else
+# define do_set_thread_area_64(p, s, t)	(0)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_PTRACE_H */
...
...
@@ -103,7 +103,17 @@ static inline void update_task_stack(struct task_struct *task)
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task_top_of_stack(task));
 #endif
 }

+static inline void kthread_frame_init(struct inactive_task_frame *frame,
+				      unsigned long fun, unsigned long arg)
+{
+	frame->bx = fun;
+#ifdef CONFIG_X86_32
+	frame->di = arg;
+#else
+	frame->r12 = arg;
+#endif
+}
+
 #endif /* _ASM_X86_SWITCH_TO_H */
...
...
@@ -144,7 +144,7 @@ struct thread_info {
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW_BASE					\
-	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|	\
-	 _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
+	(_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP |		\
+	 _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)

 /*
...
@@ -156,7 +156,13 @@
 # define _TIF_WORK_CTXSW	(_TIF_WORK_CTXSW_BASE)
 #endif

-#define _TIF_WORK_CTXSW_PREV	(_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
+#ifdef CONFIG_X86_IOPL_IOPERM
+# define _TIF_WORK_CTXSW_PREV	(_TIF_WORK_CTXSW | _TIF_USER_RETURN_NOTIFY | \
+				 _TIF_IO_BITMAP)
+#else
+# define _TIF_WORK_CTXSW_PREV	(_TIF_WORK_CTXSW | _TIF_USER_RETURN_NOTIFY)
+#endif
+
 #define _TIF_WORK_CTXSW_NEXT	(_TIF_WORK_CTXSW)

 #define STACK_WARN		(THREAD_SIZE/8)
...
...
@@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num);
 void xen_arch_unregister_cpu(int num);
 #endif

-extern void xen_set_iopl_mask(unsigned mask);
-
 #endif /* _ASM_X86_XEN_HYPERVISOR_H */
...
@@ -53,10 +53,7 @@
 #include <asm/microcode_intel.h>
 #include <asm/intel-family.h>
 #include <asm/cpu_device_id.h>
-#ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/uv/uv.h>
-#endif

 #include "cpu.h"
...
@@ -1781,7 +1778,7 @@ static void wait_for_master_cpu(int cpu)
 }

 #ifdef CONFIG_X86_64
-static void setup_getcpu(int cpu)
+static inline void setup_getcpu(int cpu)
 {
 	unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
 	struct desc_struct d = { };
...
@@ -1801,7 +1798,59 @@ static void setup_getcpu(int cpu)
 	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
 }

+static inline void ucode_cpu_init(int cpu)
+{
+	if (cpu)
+		load_ucode_ap();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss)
+{
+	/* Set up the per-CPU TSS IST stacks */
+	tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+}
+
+static inline void gdt_setup_doublefault_tss(int cpu) { }
+
+#else /* CONFIG_X86_64 */
+
+static inline void setup_getcpu(int cpu) { }
+
+static inline void ucode_cpu_init(int cpu)
+{
+	show_ucode_info_early();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+
+static inline void gdt_setup_doublefault_tss(int cpu)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	/* Set up the doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+#endif
+}
+
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+	tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+	tss->io_bitmap.prev_max = 0;
+	tss->io_bitmap.prev_sequence = 0;
+	memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+	/*
+	 * Invalidate the extra array entry past the end of the all
+	 * permission bitmap as required by the hardware.
+	 */
+	tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
+#endif
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
...
@@ -1809,21 +1858,15 @@
  * and IDT. We reload them nevertheless, this function acts as a
  * 'CPU state barrier', nothing should get across.
  */
-#ifdef CONFIG_X86_64
 void cpu_init(void)
 {
+	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+	struct task_struct *cur = current;
 	int cpu = raw_smp_processor_id();
-	struct task_struct *me;
-	struct tss_struct *t;
-	int i;

 	wait_for_master_cpu(cpu);

-	if (cpu)
-		load_ucode_ap();
-
-	t = &per_cpu(cpu_tss_rw, cpu);
+	ucode_cpu_init(cpu);

 #ifdef CONFIG_NUMA
 	if (this_cpu_read(numa_node) == 0 &&
...
@@ -1832,63 +1875,47 @@ void cpu_init(void)
 #endif
 	setup_getcpu(cpu);

-	me = current;
-
 	pr_debug("Initializing CPU#%d\n", cpu);

-	cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+	if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+	    boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+		cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
 	 */
 	switch_to_new_gdt(cpu);
-	loadsegment(fs, 0);
-
 	load_current_idt();

-	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
-	syscall_init();
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		loadsegment(fs, 0);
+		memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+		syscall_init();

-	wrmsrl(MSR_FS_BASE, 0);
-	wrmsrl(MSR_KERNEL_GS_BASE, 0);
-	barrier();
+		wrmsrl(MSR_FS_BASE, 0);
+		wrmsrl(MSR_KERNEL_GS_BASE, 0);
+		barrier();

-	x86_configure_nx();
-	x2apic_setup();
-
-	/*
-	 * set up and load the per-CPU TSS
-	 */
-	if (!t->x86_tss.ist[0]) {
-		t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
-		t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
-		t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
-		t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+		x2apic_setup();
 	}

-	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
-	/*
-	 * <= is required because the CPU will access up to
-	 * 8 bits beyond the end of the IO permission bitmap.
-	 */
-	for (i = 0; i <= IO_BITMAP_LONGS; i++)
-		t->io_bitmap[i] = ~0UL;
-
 	mmgrab(&init_mm);
-	me->active_mm = &init_mm;
-	BUG_ON(me->mm);
+	cur->active_mm = &init_mm;
+	BUG_ON(cur->mm);
 	initialize_tlbstate_and_flush();
-	enter_lazy_tlb(&init_mm, me);
+	enter_lazy_tlb(&init_mm, cur);

-	/*
-	 * Initialize the TSS. sp0 points to the entry trampoline stack
-	 * regardless of what task is running.
-	 */
+	/* Initialize the TSS. */
+	tss_setup_ist(tss);
+	tss_setup_io_bitmap(tss);
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
 	load_TR_desc();
+	/*
+	 * sp0 points to the entry trampoline stack regardless of what task
+	 * is running.
+	 */
 	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));

 	load_mm_ldt(&init_mm);
...
@@ -1896,6 +1923,8 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();

+	gdt_setup_doublefault_tss(cpu);
+
 	fpu__init_cpu();

 	if (is_uv_system())
...
@@ -1904,63 +1933,6 @@ void cpu_init(void)
 	load_fixmap_gdt(cpu);
 }
-#else
-
-void cpu_init(void)
-{
-	int cpu = smp_processor_id();
-	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
-
-	wait_for_master_cpu(cpu);
-
-	show_ucode_info_early();
-
-	pr_info("Initializing CPU#%d\n", cpu);
-
-	if (cpu_feature_enabled(X86_FEATURE_VME) ||
-	    boot_cpu_has(X86_FEATURE_TSC) ||
-	    boot_cpu_has(X86_FEATURE_DE))
-		cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
-	load_current_idt();
-	switch_to_new_gdt(cpu);
-
-	/*
-	 * Set up and load the per-CPU TSS and LDT
-	 */
-	mmgrab(&init_mm);
-	curr->active_mm = &init_mm;
-	BUG_ON(curr->mm);
-	initialize_tlbstate_and_flush();
-	enter_lazy_tlb(&init_mm, curr);
-
-	/*
-	 * Initialize the TSS. sp0 points to the entry trampoline stack
-	 * regardless of what task is running.
-	 */
-	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
-	load_TR_desc();
-	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
-
-	load_mm_ldt(&init_mm);
-
-	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-
-	clear_all_debug_regs();
-	dbg_restore_debug_regs();
-
-	fpu__init_cpu();
-
-	load_fixmap_gdt(cpu);
-}
-#endif
-
 /*
  * The microcode loader calls this upon late microcode load to recheck features,
  * only when microcode has been updated. Caller holds microcode_mutex and CPU
...
...
@@ -54,7 +54,7 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
 	.sp0		= STACK_START,
 	.ss0		= __KERNEL_DS,
 	.ldt		= 0,
-	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
+	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,

 	.ip		= (unsigned long) doublefault_fn,
 	/* 0x2 bit is always set */
...
...
@@ -3,32 +3,69 @@
  * This contains the io-permission bitmap code - written by obz, with changes
  * by Linus. 32/64 bits code unification by Miguel Botón.
  */
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/kernel.h>
 #include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
 #include <linux/security.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
 #include <linux/syscalls.h>
 #include <linux/bitmap.h>
-#include <asm/syscalls.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/io_bitmap.h>
 #include <asm/desc.h>
#ifdef CONFIG_X86_IOPL_IOPERM
static atomic64_t io_bitmap_sequence;
void io_bitmap_share(struct task_struct *tsk)
{
/* Can be NULL when current->thread.iopl_emul == 3 */
if (current->thread.io_bitmap) {
/*
* Take a refcount on current's bitmap. It can be used by
* both tasks as long as none of them changes the bitmap.
*/
refcount_inc(&current->thread.io_bitmap->refcnt);
tsk->thread.io_bitmap = current->thread.io_bitmap;
}
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
static void task_update_io_bitmap(void)
{
struct thread_struct *t = &current->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
/* TSS update is handled on exit to user space */
set_thread_flag(TIF_IO_BITMAP);
} else {
clear_thread_flag(TIF_IO_BITMAP);
/* Invalidate TSS */
preempt_disable();
tss_update_io_bitmap();
preempt_enable();
}
}
void io_bitmap_exit(void)
{
struct io_bitmap *iobm = current->thread.io_bitmap;
current->thread.io_bitmap = NULL;
task_update_io_bitmap();
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
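From the caller's side, the whole lifecycle above reduces to a pair of ioperm() calls; a minimal user-space sketch (port 0x80 chosen arbitrarily, needs CAP_SYS_RAWIO):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (ioperm(0x80, 1, 1)) {	/* allocate bitmap, grant one port */
		perror("ioperm");
		return 1;
	}
	outb(0, 0x80);			/* POST diagnostic port */
	ioperm(0x80, 1, 0);		/* drop: the bitmap is freed again */
	return 0;
}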
 /*
- * this changes the io permissions bitmap in the current task.
+ * This changes the io permissions bitmap in the current task.
  */
 long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
 {
 	struct thread_struct *t = &current->thread;
-	struct tss_struct *tss;
-	unsigned int i, max_long, bytes, bytes_updated;
+	unsigned int i, max_long;
+	struct io_bitmap *iobm;

 	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
 		return -EINVAL;
...
@@ -41,59 +78,72 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
 	 * IO bitmap up. ioperm() is much less timing critical than clone(),
 	 * this is why we delay this operation until now:
 	 */
-	if (!t->io_bitmap_ptr) {
-		unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-
-		if (!bitmap)
+	iobm = t->io_bitmap;
+	if (!iobm) {
+		/* No point to allocate a bitmap just to clear permissions */
+		if (!turn_on)
+			return 0;
+		iobm = kmalloc(sizeof(*iobm), GFP_KERNEL);
+		if (!iobm)
 			return -ENOMEM;

-		memset(bitmap, 0xff, IO_BITMAP_BYTES);
-		t->io_bitmap_ptr = bitmap;
-		set_thread_flag(TIF_IO_BITMAP);
+		memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap));
+		refcount_set(&iobm->refcnt, 1);
+	}

-		/*
-		 * Now that we have an IO bitmap, we need our TSS limit to be
-		 * correct. It's fine if we are preempted after doing this:
-		 * with TIF_IO_BITMAP set, context switches will keep our TSS
-		 * limit correct.
-		 */
-		preempt_disable();
-		refresh_tss_limit();
-		preempt_enable();
-	}
+	/*
+	 * If the bitmap is not shared, then nothing can take a refcount as
+	 * current can obviously not fork at the same time. If it's shared
+	 * duplicate it and drop the refcount on the original one.
+	 */
+	if (refcount_read(&iobm->refcnt) > 1) {
+		iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL);
+		if (!iobm)
+			return -ENOMEM;
+		refcount_set(&iobm->refcnt, 1);
+		io_bitmap_exit();
+	}

 	/*
-	 * do it in the per-thread copy and in the TSS ...
-	 *
-	 * Disable preemption via get_cpu() - we must not switch away
-	 * because the ->io_bitmap_max value must match the bitmap
-	 * contents:
+	 * Store the bitmap pointer (might be the same if the task already
+	 * had one). Must be done here so freeing the bitmap when all
+	 * permissions are dropped has the pointer set up.
 	 */
-	tss = &per_cpu(cpu_tss_rw, get_cpu());
+	t->io_bitmap = iobm;
+	/* Mark it active for context switching and exit to user mode */
+	set_thread_flag(TIF_IO_BITMAP);

+	/*
+	 * Update the tasks bitmap. The update of the TSS bitmap happens on
+	 * exit to user mode. So this needs no protection.
+	 */
 	if (turn_on)
-		bitmap_clear(t->io_bitmap_ptr, from, num);
+		bitmap_clear(iobm->bitmap, from, num);
 	else
-		bitmap_set(t->io_bitmap_ptr, from, num);
+		bitmap_set(iobm->bitmap, from, num);

 	/*
 	 * Search for a (possibly new) maximum. This is simple and stupid,
 	 * to keep it obviously correct:
 	 */
-	max_long = 0;
-	for (i = 0; i < IO_BITMAP_LONGS; i++)
-		if (t->io_bitmap_ptr[i] != ~0UL)
+	max_long = UINT_MAX;
+	for (i = 0; i < IO_BITMAP_LONGS; i++) {
+		if (iobm->bitmap[i] != ~0UL)
 			max_long = i;
+	}
+	/* All permissions dropped? */
+	if (max_long == UINT_MAX) {
+		io_bitmap_exit();
+		return 0;
+	}

-	bytes = (max_long + 1) * sizeof(unsigned long);
-	bytes_updated = max(bytes, t->io_bitmap_max);
-	t->io_bitmap_max = bytes;
-
-	/* Update the TSS: */
-	memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+	iobm->max = (max_long + 1) * sizeof(unsigned long);

-	put_cpu();
+	/*
+	 * Update the sequence number to force a TSS update on return to
+	 * user mode.
+	 */
+	iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence);

 	return 0;
 }
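The scan above relies on the inverted convention of the hardware bitmap: a 0 bit permits a port, all-ones means no access. A small standalone model (an illustration, not the kernel's bitmap helpers) of the max recomputation and the new all-permissions-dropped case:

#include <limits.h>
#include <stdio.h>

#define LONGS 4	/* stand-in for IO_BITMAP_LONGS */

int main(void)
{
	unsigned long map[LONGS];
	unsigned int i, max_long = UINT_MAX;

	for (i = 0; i < LONGS; i++)
		map[i] = ~0UL;		/* all ones: everything denied */

	map[1] &= ~(1UL << 3);		/* grant one port in the 2nd long */

	for (i = 0; i < LONGS; i++)
		if (map[i] != ~0UL)
			max_long = i;	/* last long holding a permission */

	if (max_long == UINT_MAX)
		printf("no permissions -> bitmap would be freed\n");
	else
		printf("copy %zu bytes on exit to user\n",
		       (max_long + 1) * sizeof(unsigned long));
	return 0;
}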
...
@@ -104,38 +154,61 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
 }

 /*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ * The sys_iopl functionality depends on the level argument, which if
+ * granted for the task is used to enable access to all 65536 I/O ports.
+ *
+ * This does not use the IOPL mechanism provided by the CPU as that would
+ * also allow the user space task to use the CLI/STI instructions.
  *
- * Here we just change the flags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
+ * Disabling interrupts in a user space task is dangerous as it might lock
+ * up the machine and the semantics vs. syscalls and exceptions is
+ * undefined.
+ *
+ * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3
+ * enables them.
+ *
+ * IOPL is strictly per thread and inherited on fork.
  */
 SYSCALL_DEFINE1(iopl, unsigned int, level)
 {
-	struct pt_regs *regs = current_pt_regs();
 	struct thread_struct *t = &current->thread;
-
-	/*
-	 * Careful: the IOPL bits in regs->flags are undefined under Xen PV
-	 * and changing them has no effect.
-	 */
-	unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+	unsigned int old;

 	if (level > 3)
 		return -EINVAL;

+	old = t->iopl_emul;
+
+	/* No point in going further if nothing changes */
+	if (level == old)
+		return 0;
+
 	/* Trying to gain more privileges? */
 	if (level > old) {
 		if (!capable(CAP_SYS_RAWIO) ||
 		    security_locked_down(LOCKDOWN_IOPORT))
 			return -EPERM;
 	}
-	regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
-		(level << X86_EFLAGS_IOPL_BIT);
-	t->iopl = level << X86_EFLAGS_IOPL_BIT;
-	set_iopl_mask(t->iopl);
+
+	t->iopl_emul = level;
+	task_update_io_bitmap();

 	return 0;
 }
+
+#else /* CONFIG_X86_IOPL_IOPERM */
+
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+	return -ENOSYS;
+}
+
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+	return -ENOSYS;
+}
+
+SYSCALL_DEFINE1(iopl, unsigned int, level)
+{
+	return -ENOSYS;
+}
+#endif
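With the #else stubs above, callers can now tell "compiled out" apart from "not permitted". A caller-side probe sketch (the messages are assumptions; the glibc wrapper returns -1 and sets errno):

#include <errno.h>
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (iopl(3) == 0) {
		printf("iopl granted (emulated via I/O bitmap)\n");
		iopl(0);	/* levels 0-2 all mean: no I/O access */
	} else if (errno == ENOSYS) {
		printf("kernel built without X86_IOPL_IOPERM\n");
	} else if (errno == EPERM) {
		printf("need CAP_SYS_RAWIO (and no lockdown)\n");
	}
	return 0;
}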
...
@@ -341,8 +341,6 @@ struct paravirt_patch_template pv_ops = {
 	.cpu.iret		= native_iret,
 	.cpu.swapgs		= native_swapgs,

-	.cpu.set_iopl_mask	= native_set_iopl_mask,
-
 	.cpu.start_context_switch	= paravirt_nop,
 	.cpu.end_context_switch		= paravirt_nop,
...
...
@@ -41,6 +41,7 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/spec-ctrl.h>
+#include <asm/io_bitmap.h>
 #include <asm/proto.h>

 #include "process.h"
...
@@ -72,18 +73,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
 #endif
+		.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
 	},
-#ifdef CONFIG_X86_32
-	/*
-	 * Note that the .io_bitmap member must be extra-big. This is because
-	 * the CPU will access an additional byte beyond the end of the IO
-	 * permission bitmap. The extra byte must be all 1 bits, and must
-	 * be within the limit.
-	 */
-	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
-#endif
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
...
@@ -110,26 +102,87 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 void exit_thread(struct task_struct *tsk)
 {
 	struct thread_struct *t = &tsk->thread;
-	unsigned long *bp = t->io_bitmap_ptr;
 	struct fpu *fpu = &t->fpu;

-	if (bp) {
-		struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
-
-		t->io_bitmap_ptr = NULL;
-		clear_thread_flag(TIF_IO_BITMAP);
-		/*
-		 * Careful, clear this in the TSS too:
-		 */
-		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
-		t->io_bitmap_max = 0;
-		put_cpu();
-		kfree(bp);
-	}
+	if (test_thread_flag(TIF_IO_BITMAP))
+		io_bitmap_exit();

 	free_vm86(t);

 	fpu__drop(fpu);
 }

+static int set_new_tls(struct task_struct *p, unsigned long tls)
+{
+	struct user_desc __user *utls = (struct user_desc __user *)tls;
+
+	if (in_ia32_syscall())
+		return do_set_thread_area(p, -1, utls, 0);
+	else
+		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+		    unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+	struct inactive_task_frame *frame;
+	struct fork_frame *fork_frame;
+	struct pt_regs *childregs;
+	int ret = 0;
+
+	childregs = task_pt_regs(p);
+	fork_frame = container_of(childregs, struct fork_frame, regs);
+	frame = &fork_frame->frame;
+
+	frame->bp = 0;
+	frame->ret_addr = (unsigned long) ret_from_fork;
+	p->thread.sp = (unsigned long) fork_frame;
+	p->thread.io_bitmap = NULL;
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+#ifdef CONFIG_X86_64
+	savesegment(gs, p->thread.gsindex);
+	p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
+	savesegment(fs, p->thread.fsindex);
+	p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
+	savesegment(es, p->thread.es);
+	savesegment(ds, p->thread.ds);
+#else
+	p->thread.sp0 = (unsigned long) (childregs + 1);
+	/*
+	 * Clear all status flags including IF and set fixed bit. 64bit
+	 * does not have this initialization as the frame does not contain
+	 * flags. The flags consistency (especially vs. AC) is there
+	 * ensured via objtool, which lacks 32bit support.
+	 */
+	frame->flags = X86_EFLAGS_FIXED;
+#endif
+
+	/* Kernel thread ? */
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		memset(childregs, 0, sizeof(struct pt_regs));
+		kthread_frame_init(frame, sp, arg);
+		return 0;
+	}
+
+	frame->bx = 0;
+	*childregs = *current_pt_regs();
+	childregs->ax = 0;
+	if (sp)
+		childregs->sp = sp;
+
+#ifdef CONFIG_X86_32
+	task_user_gs(p) = get_user_gs(current_pt_regs());
+#endif
+
+	/* Set a new TLS for the child thread? */
+	if (clone_flags & CLONE_SETTLS)
+		ret = set_new_tls(p, tls);
+
+	if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
+		io_bitmap_share(p);
+
+	return ret;
+}
 void flush_thread(void)
...
@@ -269,31 +322,96 @@ void arch_setup_new_exec(void)
 	}
 }

-static inline void switch_to_bitmap(struct thread_struct *prev,
-				    struct thread_struct *next,
-				    unsigned long tifp, unsigned long tifn)
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void tss_invalidate_io_bitmap(struct tss_struct *tss)
 {
-	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+	/*
+	 * Invalidate the I/O bitmap by moving io_bitmap_base outside the
+	 * TSS limit so any subsequent I/O access from user space will
+	 * trigger a #GP.
+	 *
+	 * This is correct even when VMEXIT rewrites the TSS limit
+	 * to 0x67 as the only requirement is that the base points
+	 * outside the limit.
+	 */
+	tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+}

-	if (tifn & _TIF_IO_BITMAP) {
-		/*
-		 * Copy the relevant range of the IO bitmap.
-		 * Normally this is 128 bytes or less:
-		 */
-		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-		       max(prev->io_bitmap_max, next->io_bitmap_max));
+static inline void switch_to_bitmap(unsigned long tifp)
+{
+	/*
+	 * Invalidate I/O bitmap if the previous task used it. This prevents
+	 * any possible leakage of an active I/O bitmap.
+	 *
+	 * If the next task has an I/O bitmap it will handle it on exit to
+	 * user mode.
+	 */
+	if (tifp & _TIF_IO_BITMAP)
+		tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw));
+}

-		/*
-		 * Make sure that the TSS limit is correct for the CPU
-		 * to notice the IO bitmap.
-		 */
-		refresh_tss_limit();
-	} else if (tifp & _TIF_IO_BITMAP) {
-		/*
-		 * Clear any possible leftover bits:
-		 */
-		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
+{
+	/*
+	 * Copy at least the byte range of the incoming tasks bitmap which
+	 * covers the permitted I/O ports.
+	 *
+	 * If the previous task which used an I/O bitmap had more bits
+	 * permitted, then the copy needs to cover those as well so they
+	 * get turned off.
+	 */
+	memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
+	       max(tss->io_bitmap.prev_max, iobm->max));
+
+	/*
+	 * Store the new max and the sequence number of this bitmap
+	 * and a pointer to the bitmap itself.
+	 */
+	tss->io_bitmap.prev_max = iobm->max;
+	tss->io_bitmap.prev_sequence = iobm->sequence;
+}
+
+/**
+ * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ */
+void tss_update_io_bitmap(void)
+{
+	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+	u16 *base = &tss->x86_tss.io_bitmap_base;
+
+	if (test_thread_flag(TIF_IO_BITMAP)) {
+		struct thread_struct *t = &current->thread;
+
+		if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+			*base = IO_BITMAP_OFFSET_VALID_ALL;
+		} else {
+			struct io_bitmap *iobm = t->io_bitmap;
+
+			/*
+			 * Only copy bitmap data when the sequence number
+			 * differs. The update time is accounted to the
+			 * incoming task.
+			 */
+			if (tss->io_bitmap.prev_sequence != iobm->sequence)
+				tss_copy_io_bitmap(tss, iobm);
+
+			/* Enable the bitmap */
+			*base = IO_BITMAP_OFFSET_VALID_MAP;
+		}
+
+		/*
+		 * Make sure that the TSS limit is covering the io bitmap.
+		 * It might have been cut down by a VMEXIT to 0x67 which
+		 * would cause a subsequent I/O access from user space to
+		 * trigger a #GP because the bitmap is outside the TSS
+		 * limit.
+		 */
+		refresh_tss_limit();
+	} else {
+		tss_invalidate_io_bitmap(tss);
 	}
 }
+#else /* CONFIG_X86_IOPL_IOPERM */
+static inline void switch_to_bitmap(unsigned long tifp) { }
+#endif
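A toy model (an assumption for illustration, not kernel code) of the sequence-number optimization implemented above: the per-CPU TSS remembers which bitmap generation it holds, so repeated exits to user mode with an unchanged bitmap skip the 8 KiB copy entirely.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t seq_gen;			/* cf. io_bitmap_sequence */
struct bitmap { uint64_t seq; char data[8192]; };
struct tss    { uint64_t prev_seq; char data[8192]; };

static void publish(struct tss *tss, struct bitmap *bm)
{
	if (tss->prev_seq != bm->seq) {		/* only copy when stale */
		memcpy(tss->data, bm->data, sizeof(tss->data));
		tss->prev_seq = bm->seq;
		puts("copied");
	} else {
		puts("skipped (sequence matched)");
	}
}

int main(void)
{
	struct tss tss = { 0 };
	struct bitmap bm = { .seq = ++seq_gen };

	publish(&tss, &bm);	/* first exit to user: copies */
	publish(&tss, &bm);	/* next exit, same bitmap: skips */
	bm.seq = ++seq_gen;	/* ioperm() bumped the sequence */
	publish(&tss, &bm);	/* copies again */
	return 0;
}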
 #ifdef CONFIG_SMP
...
@@ -505,7 +623,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 	tifn = READ_ONCE(task_thread_info(next_p)->flags);
 	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
-	switch_to_bitmap(prev, next, tifp, tifn);
+
+	switch_to_bitmap(tifp);

 	propagate_user_return_notify(prev_p, next_p);
...
...
@@ -112,74 +112,6 @@ void release_thread(struct task_struct *dead_task)
 	release_vm86_irqs(dead_task);
 }

-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-	unsigned long arg, struct task_struct *p, unsigned long tls)
-{
-	struct pt_regs *childregs = task_pt_regs(p);
-	struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
-	struct inactive_task_frame *frame = &fork_frame->frame;
-	struct task_struct *tsk;
-	int err;
-
-	/*
-	 * For a new task use the RESET flags value since there is no before.
-	 * All the status flags are zero; DF and all the system flags must also
-	 * be 0, specifically IF must be 0 because we context switch to the new
-	 * task with interrupts disabled.
-	 */
-	frame->flags = X86_EFLAGS_FIXED;
-	frame->bp = 0;
-	frame->ret_addr = (unsigned long) ret_from_fork;
-	p->thread.sp = (unsigned long) fork_frame;
-	p->thread.sp0 = (unsigned long) (childregs+1);
-	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
-	if (unlikely(p->flags & PF_KTHREAD)) {
-		/* kernel thread */
-		memset(childregs, 0, sizeof(struct pt_regs));
-		frame->bx = sp;		/* function */
-		frame->di = arg;
-		p->thread.io_bitmap_ptr = NULL;
-		return 0;
-	}
-
-	frame->bx = 0;
-	*childregs = *current_pt_regs();
-	childregs->ax = 0;
-	if (sp)
-		childregs->sp = sp;
-
-	task_user_gs(p) = get_user_gs(current_pt_regs());
-
-	p->thread.io_bitmap_ptr = NULL;
-	tsk = current;
-	err = -ENOMEM;
-
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
-		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
-						  IO_BITMAP_BYTES, GFP_KERNEL);
-		if (!p->thread.io_bitmap_ptr) {
-			p->thread.io_bitmap_max = 0;
-			return -ENOMEM;
-		}
-		set_tsk_thread_flag(p, TIF_IO_BITMAP);
-	}
-
-	err = 0;
-
-	/*
-	 * Set a new TLS for the child thread?
-	 */
-	if (clone_flags & CLONE_SETTLS)
-		err = do_set_thread_area(p, -1,
-			(struct user_desc __user *)tls, 0);
-
-	if (err && p->thread.io_bitmap_ptr) {
-		kfree(p->thread.io_bitmap_ptr);
-		p->thread.io_bitmap_max = 0;
-	}
-	return err;
-}
-
 void
 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 {
...
@@ -255,15 +187,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	load_TLS(next, cpu);

-	/*
-	 * Restore IOPL if needed. In normal use, the flags restore
-	 * in the switch assembly will handle this. But if the kernel
-	 * is running virtualized at a non-zero CPL, the popf will
-	 * not restore flags, so it must be done in a separate step.
-	 */
-	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
-		set_iopl_mask(next->iopl);
-
 	switch_to_extra(prev_p, next_p);

 	/*
...
...
@@ -371,81 +371,6 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 	task->thread.gsbase = gsbase;
 }

-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-		unsigned long arg, struct task_struct *p, unsigned long tls)
-{
-	int err;
-	struct pt_regs *childregs;
-	struct fork_frame *fork_frame;
-	struct inactive_task_frame *frame;
-	struct task_struct *me = current;
-
-	childregs = task_pt_regs(p);
-	fork_frame = container_of(childregs, struct fork_frame, regs);
-	frame = &fork_frame->frame;
-
-	frame->bp = 0;
-	frame->ret_addr = (unsigned long) ret_from_fork;
-	p->thread.sp = (unsigned long) fork_frame;
-	p->thread.io_bitmap_ptr = NULL;
-
-	savesegment(gs, p->thread.gsindex);
-	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
-	savesegment(fs, p->thread.fsindex);
-	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
-	savesegment(es, p->thread.es);
-	savesegment(ds, p->thread.ds);
-	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
-	if (unlikely(p->flags & PF_KTHREAD)) {
-		/* kernel thread */
-		memset(childregs, 0, sizeof(struct pt_regs));
-		frame->bx = sp;		/* function */
-		frame->r12 = arg;
-		return 0;
-	}
-
-	frame->bx = 0;
-	*childregs = *current_pt_regs();
-
-	childregs->ax = 0;
-	if (sp)
-		childregs->sp = sp;
-
-	err = -ENOMEM;
-	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
-		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
-						  IO_BITMAP_BYTES, GFP_KERNEL);
-		if (!p->thread.io_bitmap_ptr) {
-			p->thread.io_bitmap_max = 0;
-			return -ENOMEM;
-		}
-		set_tsk_thread_flag(p, TIF_IO_BITMAP);
-	}
-
-	/*
-	 * Set a new TLS for the child thread?
-	 */
-	if (clone_flags & CLONE_SETTLS) {
-#ifdef CONFIG_IA32_EMULATION
-		if (in_ia32_syscall())
-			err = do_set_thread_area(p, -1,
-				(struct user_desc __user *)tls, 0);
-		else
-#endif
-			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
-		if (err)
-			goto out;
-	}
-	err = 0;
-out:
-	if (err && p->thread.io_bitmap_ptr) {
-		kfree(p->thread.io_bitmap_ptr);
-		p->thread.io_bitmap_max = 0;
-	}
-	return err;
-}
-
 static void
 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		    unsigned long new_sp,
...
@@ -572,17 +497,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)

 	switch_to_extra(prev_p, next_p);

-#ifdef CONFIG_XEN_PV
-	/*
-	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
-	 * current_pt_regs()->flags may not match the current task's
-	 * intended IOPL. We need to switch it manually.
-	 */
-	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
-		     prev->iopl != next->iopl))
-		xen_set_iopl_mask(next->iopl);
-#endif
-
 	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
 	/*
 	 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
...
...
@@ -42,6 +42,7 @@
 #include <asm/traps.h>
 #include <asm/syscall.h>
 #include <asm/fsgsbase.h>
+#include <asm/io_bitmap.h>

 #include "tls.h"
...
@@ -697,7 +698,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n,
 static int ioperm_active(struct task_struct *target,
 			 const struct user_regset *regset)
 {
-	return target->thread.io_bitmap_max / regset->size;
+	struct io_bitmap *iobm = target->thread.io_bitmap;
+
+	return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0;
 }

 static int ioperm_get(struct task_struct *target,
...
@@ -705,12 +708,13 @@ static int ioperm_get(struct task_struct *target,
 		      unsigned int pos, unsigned int count,
 		      void *kbuf, void __user *ubuf)
 {
-	if (!target->thread.io_bitmap_ptr)
+	struct io_bitmap *iobm = target->thread.io_bitmap;
+
+	if (!iobm)
 		return -ENXIO;

 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   target->thread.io_bitmap_ptr,
-				   0, IO_BITMAP_BYTES);
+				   iobm->bitmap, 0, IO_BITMAP_BYTES);
 }

 /*
...
...
@@ -1368,14 +1368,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
 		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

-		/*
-		 * VM exits change the host TR limit to 0x67 after a VM
-		 * exit. This is okay, since 0x67 covers everything except
-		 * the IO bitmap and we have code to handle the IO bitmap
-		 * being lost after a VM exit.
-		 */
-		BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
-
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
...
...
@@ -161,6 +161,14 @@ static void __init setup_cpu_entry_area(unsigned int cpu)
 	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
 		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
 	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+
+	/*
+	 * VMX changes the host TR limit to 0x67 after a VM exit. This is
+	 * okay, since 0x67 covers the size of struct x86_hw_tss. Make sure
+	 * that this is correct.
+	 */
+	BUILD_BUG_ON(offsetof(struct tss_struct, x86_tss) != 0);
+	BUILD_BUG_ON(sizeof(struct x86_hw_tss) != 0x68);
+
 	cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu),
 			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
...
...
@@ -837,15 +837,6 @@ static void xen_load_sp0(unsigned long sp0)
 	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }

-void xen_set_iopl_mask(unsigned mask)
-{
-	struct physdev_set_iopl set_iopl;
-
-	/* Force the change at ring 0. */
-	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
-	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-}
-
 static void xen_io_delay(void)
 {
 }
...
@@ -1055,7 +1046,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.write_idt_entry = xen_write_idt_entry,
 	.load_sp0 = xen_load_sp0,

-	.set_iopl_mask = xen_set_iopl_mask,
 	.io_delay = xen_io_delay,

 	/* Xen takes care of %gs when switching to usermode for us */
...
...
@@ -131,6 +131,17 @@ int main(void)
 		printf("[RUN]\tchild: check that we inherited permissions\n");
 		expect_ok(0x80);
 		expect_gp(0xed);
+		printf("[RUN]\tchild: Extend permissions to 0x81\n");
+		if (ioperm(0x81, 1, 1) != 0) {
+			printf("[FAIL]\tioperm(0x81, 1, 1) failed (%d)", errno);
+			return 1;
+		}
+		printf("[RUN]\tchild: Drop permissions to 0x80\n");
+		if (ioperm(0x80, 1, 0) != 0) {
+			printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+			return 1;
+		}
+		expect_gp(0x80);
 		return 0;
 	} else {
 		int status;
...
@@ -146,8 +157,11 @@ int main(void)
 		}
 	}

-	/* Test the capability checks. */
+	/* Verify that the child dropping 0x80 did not affect the parent */
+	printf("\tVerify that unsharing the bitmap worked\n");
+	expect_ok(0x80);
+
+	/* Test the capability checks. */
 	printf("\tDrop privileges\n");
 	if (setresuid(1, 1, 1) != 0) {
 		printf("[WARN]\tDropping privileges failed\n");
...
...
@@ -35,6 +35,16 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
 }

+static void clearhandler(int sig)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
 static jmp_buf jmpbuf;

 static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
...
@@ -42,25 +52,128 @@ static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
 	siglongjmp(jmpbuf, 1);
 }

static bool try_outb(unsigned short port)
{
sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
if (sigsetjmp(jmpbuf, 1) != 0) {
return false;
} else {
asm volatile ("outb %%al, %w[port]"
: : [port] "Nd" (port), "a" (0));
return true;
}
clearhandler(SIGSEGV);
}
static void expect_ok_outb(unsigned short port)
{
if (!try_outb(port)) {
printf("[FAIL]\toutb to 0x%02hx failed\n", port);
exit(1);
}
printf("[OK]\toutb to 0x%02hx worked\n", port);
}
static void expect_gp_outb(unsigned short port)
{
if (try_outb(port)) {
printf("[FAIL]\toutb to 0x%02hx worked\n", port);
nerrs++;
}
printf("[OK]\toutb to 0x%02hx failed\n", port);
}
static bool try_cli(void)
{
sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
if (sigsetjmp(jmpbuf, 1) != 0) {
return false;
} else {
asm volatile ("cli");
return true;
}
clearhandler(SIGSEGV);
}
static bool try_sti(void)
{
sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
if (sigsetjmp(jmpbuf, 1) != 0) {
return false;
} else {
asm volatile ("sti");
return true;
}
clearhandler(SIGSEGV);
}
static void expect_gp_sti(void)
{
if (try_sti()) {
printf("[FAIL]\tSTI worked\n");
nerrs++;
} else {
printf("[OK]\tSTI faulted\n");
}
}
static void expect_gp_cli(void)
{
if (try_cli()) {
printf("[FAIL]\tCLI worked\n");
nerrs++;
} else {
printf("[OK]\tCLI faulted\n");
}
}
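The helpers above all follow the same probe pattern; a minimal standalone version (hedged: simplified to a single CLI probe) shows the mechanics of the one-shot handler plus sigsetjmp():

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

static sigjmp_buf jmpbuf;

static void handler(int sig, siginfo_t *si, void *ctx)
{
	siglongjmp(jmpbuf, 1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO | SA_RESETHAND;	/* one-shot */
	sigaction(SIGSEGV, &sa, NULL);

	if (sigsetjmp(jmpbuf, 1) == 0) {
		asm volatile ("cli");
		puts("cli executed (running with I/O privilege?)");
	} else {
		puts("cli faulted, as expected for normal user space");
	}
	return 0;
}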
 int main(void)
 {
 	cpu_set_t cpuset;

 	CPU_ZERO(&cpuset);
 	CPU_SET(0, &cpuset);
 	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
 		err(1, "sched_setaffinity to CPU 0");

 	/* Probe for iopl support. Note that iopl(0) works even as nonroot. */
-	if (iopl(3) != 0) {
+	switch(iopl(3)) {
+	case 0:
+		break;
+	case -ENOSYS:
+		printf("[OK]\tiopl() not supported\n");
+		return 0;
+	default:
 		printf("[OK]\tiopl(3) failed (%d) -- try running as root\n",
 		       errno);
 		return 0;
 	}

-	/* Restore our original state prior to starting the test. */
+	/* Make sure that CLI/STI are blocked even with IOPL level 3 */
+	expect_gp_cli();
+	expect_gp_sti();
+	expect_ok_outb(0x80);
+
+	/* Establish an I/O bitmap to test the restore */
+	if (ioperm(0x80, 1, 1) != 0)
+		err(1, "ioperm(0x80, 1, 1) failed\n");
+
+	/* Restore our original state prior to starting the fork test. */
 	if (iopl(0) != 0)
 		err(1, "iopl(0)");

+	/*
+	 * Verify that IOPL emulation is disabled and the I/O bitmap still
+	 * works.
+	 */
+	expect_ok_outb(0x80);
+	expect_gp_outb(0xed);
+
+	/* Drop the I/O bitmap */
+	if (ioperm(0x80, 1, 0) != 0)
+		err(1, "ioperm(0x80, 1, 0) failed\n");
+
 	pid_t child = fork();
 	if (child == -1)
 		err(1, "fork");
...
@@ -90,14 +203,9 @@ int main(void)

 	printf("[RUN]\tparent: write to 0x80 (should fail)\n");

-	sethandler(SIGSEGV, sigsegv, 0);
-	if (sigsetjmp(jmpbuf, 1) != 0) {
-		printf("[OK]\twrite was denied\n");
-	} else {
-		asm volatile ("outb %%al, $0x80" : : "a" (0));
-		printf("[FAIL]\twrite was allowed\n");
-		nerrs++;
-	}
+	expect_gp_outb(0x80);
+	expect_gp_cli();
+	expect_gp_sti();

 	/* Test the capability checks. */
 	printf("\tiopl(3)\n");
...
@@ -133,4 +241,3 @@ int main(void)
 done:
 	return nerrs ? 1 : 0;
 }