Commit e1d20bea authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 asm updates from Ingo Molnar:
 "The main changes in this cycle were the fsgsbase related preparatory
  patches from Chang S. Bae - but there's also an optimized
  memcpy_flushcache() and a cleanup for the __cmpxchg_double() assembly
  glue"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/fsgsbase/64: Clean up various details
  x86/segments: Introduce the 'CPUNODE' naming to better document the segment limit CPU/node NR trick
  x86/vdso: Initialize the CPU/node NR segment descriptor earlier
  x86/vdso: Introduce helper functions for CPU and node number
  x86/segments/64: Rename the GDT PER_CPU entry to CPU_NUMBER
  x86/fsgsbase/64: Factor out FS/GS segment loading from __switch_to()
  x86/fsgsbase/64: Convert the ELF core dump code to the new FSGSBASE helpers
  x86/fsgsbase/64: Make ptrace use the new FS/GS base helpers
  x86/fsgsbase/64: Introduce FS/GS base helper functions
  x86/fsgsbase/64: Fix ptrace() to read the FS/GS base accurately
  x86/asm: Use CC_SET()/CC_OUT() in __cmpxchg_double()
  x86/asm: Optimize memcpy_flushcache()
parents cbbfb0ae ec3a9418
......@@ -13,14 +13,8 @@
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
vdso_read_cpunode(cpu, node);
p = __getcpu();
if (cpu)
*cpu = p & VGETCPU_CPU_MASK;
if (node)
*node = p >> 12;
return 0;
}
......
......@@ -332,40 +332,6 @@ static __init int vdso_setup(char *s)
return 0;
}
__setup("vdso=", vdso_setup);
#endif
#ifdef CONFIG_X86_64
static void vgetcpu_cpu_init(void *arg)
{
int cpu = smp_processor_id();
struct desc_struct d = { };
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node(cpu);
#endif
if (static_cpu_has(X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
/*
* Store cpu number in limit so that it can be loaded
* quickly in user space in vgetcpu. (12 bits for the CPU
* and 8 bits for the node)
*/
d.limit0 = cpu | ((node & 0xf) << 12);
d.limit1 = node >> 4;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3; /* Visible to user code */
d.s = 1; /* Not a system segment */
d.p = 1; /* Present */
d.d = 1; /* 32-bit */
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static int vgetcpu_online(unsigned int cpu)
{
return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
}
static int __init init_vdso(void)
{
......@@ -375,9 +341,7 @@ static int __init init_vdso(void)
init_vdso_image(&vdso_image_x32);
#endif
/* notifier priority > KVM */
return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
"x86/vdso/vma:online", vgetcpu_online, NULL);
return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
......@@ -242,10 +242,12 @@ extern void __add_wrong_size(void)
BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \
VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \
asm volatile(pfx "cmpxchg%c4b %2; sete %0" \
: "=a" (__ret), "+d" (__old2), \
"+m" (*(p1)), "+m" (*(p2)) \
: "i" (2 * sizeof(long)), "a" (__old1), \
asm volatile(pfx "cmpxchg%c5b %1" \
CC_SET(e) \
: CC_OUT(e) (__ret), \
"+m" (*(p1)), "+m" (*(p2)), \
"+a" (__old1), "+d" (__old2) \
: "i" (2 * sizeof(long)), \
"b" (__new1), "c" (__new2)); \
__ret; \
})
......
......@@ -10,6 +10,7 @@
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
#include <asm/fsgsbase.h>
typedef unsigned long elf_greg_t;
......@@ -204,7 +205,6 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
......@@ -227,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
(pr_reg)[21] = x86_fsbase_read_cpu(); \
(pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_FSGSBASE_H
#define _ASM_FSGSBASE_H
#ifndef __ASSEMBLY__
#ifdef CONFIG_X86_64
#include <asm/msr-index.h>
/*
* Read/write a task's FSBASE or GSBASE. This returns the value that
* the FS/GS base would have (if the task were to be resumed). These
* work on the current task or on a non-running (typically stopped
* ptrace child) task.
*/
extern unsigned long x86_fsbase_read_task(struct task_struct *task);
extern unsigned long x86_gsbase_read_task(struct task_struct *task);
extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
/* Helper functions for reading/writing FS/GS base */
static inline unsigned long x86_fsbase_read_cpu(void)
{
unsigned long fsbase;
rdmsrl(MSR_FS_BASE, fsbase);
return fsbase;
}
static inline unsigned long x86_gsbase_read_cpu_inactive(void)
{
unsigned long gsbase;
rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
return gsbase;
}
extern void x86_fsbase_write_cpu(unsigned long fsbase);
extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
#endif /* CONFIG_X86_64 */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_FSGSBASE_H */
......@@ -186,8 +186,7 @@
#define GDT_ENTRY_TLS_MIN 12
#define GDT_ENTRY_TLS_MAX 14
/* Abused to load per CPU data from limit */
#define GDT_ENTRY_PER_CPU 15
#define GDT_ENTRY_CPUNODE 15
/*
* Number of entries in the GDT table:
......@@ -207,7 +206,7 @@
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
#define __USER32_DS __USER_DS
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
#endif
......@@ -225,6 +224,47 @@
#define GDT_ENTRY_TLS_ENTRIES 3
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
#ifdef CONFIG_X86_64
/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
#define VDSO_CPUNODE_BITS 12
#define VDSO_CPUNODE_MASK 0xfff
#ifndef __ASSEMBLY__
/* Helper functions to store/load CPU and node numbers */
static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node)
{
return (node << VDSO_CPUNODE_BITS) | cpu;
}
static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
{
unsigned int p;
/*
* Load CPU and node number from the GDT. LSL is faster than RDTSCP
* and works on all CPUs. This is volatile so that it orders
* correctly with respect to barrier() and to keep GCC from cleverly
* hoisting it out of the calling function.
*
* If RDPID is available, use it.
*/
alternative_io ("lsl %[seg],%[p]",
".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
X86_FEATURE_RDPID,
[p] "=a" (p), [seg] "r" (__CPUNODE_SEG));
if (cpu)
*cpu = (p & VDSO_CPUNODE_MASK);
if (node)
*node = (p >> VDSO_CPUNODE_BITS);
}
#endif /* !__ASSEMBLY__ */
#endif /* CONFIG_X86_64 */
#ifdef __KERNEL__
/*
......
......@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
void memcpy_flushcache(void *dst, const void *src, size_t cnt);
void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
{
if (__builtin_constant_p(cnt)) {
switch (cnt) {
case 4:
asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
return;
case 8:
asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
return;
case 16:
asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
return;
}
}
__memcpy_flushcache(dst, src, cnt);
}
#endif
#endif /* __KERNEL__ */
......
......@@ -77,30 +77,4 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s)
++s->seq;
}
#ifdef CONFIG_X86_64
#define VGETCPU_CPU_MASK 0xfff
static inline unsigned int __getcpu(void)
{
unsigned int p;
/*
* Load per CPU data from GDT. LSL is faster than RDTSCP and
* works on all CPUs. This is volatile so that it orders
* correctly wrt barrier() and to keep gcc from cleverly
* hoisting it out of the calling function.
*
* If RDPID is available, use it.
*/
alternative_io ("lsl %[seg],%[p]",
".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
X86_FEATURE_RDPID,
[p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
return p;
}
#endif /* CONFIG_X86_64 */
#endif /* _ASM_X86_VGTOD_H */
......@@ -1669,6 +1669,29 @@ static void wait_for_master_cpu(int cpu)
#endif
}
#ifdef CONFIG_X86_64
static void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
if (static_cpu_has(X86_FEATURE_RDTSCP))
write_rdtscp_aux(cpudata);
/* Store CPU and node number in limit. */
d.limit0 = cpudata;
d.limit1 = cpudata >> 16;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3; /* Visible to user code */
d.s = 1; /* Not a system segment */
d.p = 1; /* Present */
d.d = 1; /* 32-bit */
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
#endif
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
......@@ -1706,6 +1729,7 @@ void cpu_init(void)
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
setup_getcpu(cpu);
me = current;
......
......@@ -54,6 +54,7 @@
#include <asm/vdso.h>
#include <asm/intel_rdt_sched.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
......@@ -286,6 +287,138 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
struct thread_struct *next)
{
load_seg_legacy(prev->fsindex, prev->fsbase,
next->fsindex, next->fsbase, FS);
load_seg_legacy(prev->gsindex, prev->gsbase,
next->gsindex, next->gsbase, GS);
}
static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
unsigned short selector)
{
unsigned short idx = selector >> 3;
unsigned long base;
if (likely((selector & SEGMENT_TI_MASK) == 0)) {
if (unlikely(idx >= GDT_ENTRIES))
return 0;
/*
* There are no user segments in the GDT with nonzero bases
* other than the TLS segments.
*/
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
return 0;
idx -= GDT_ENTRY_TLS_MIN;
base = get_desc_base(&task->thread.tls_array[idx]);
} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
/*
* If performance here mattered, we could protect the LDT
* with RCU. This is a slow path, though, so we can just
* take the mutex.
*/
mutex_lock(&task->mm->context.lock);
ldt = task->mm->context.ldt;
if (unlikely(idx >= ldt->nr_entries))
base = 0;
else
base = get_desc_base(ldt->entries + idx);
mutex_unlock(&task->mm->context.lock);
#else
base = 0;
#endif
}
return base;
}
void x86_fsbase_write_cpu(unsigned long fsbase)
{
/*
* Set the selector to 0 as a notion, that the segment base is
* overwritten, which will be checked for skipping the segment load
* during context switch.
*/
loadseg(FS, 0);
wrmsrl(MSR_FS_BASE, fsbase);
}
void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
/* Set the selector to 0 for the same reason as %fs above. */
loadseg(GS, 0);
wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
}
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
unsigned long fsbase;
if (task == current)
fsbase = x86_fsbase_read_cpu();
else if (task->thread.fsindex == 0)
fsbase = task->thread.fsbase;
else
fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
return fsbase;
}
unsigned long x86_gsbase_read_task(struct task_struct *task)
{
unsigned long gsbase;
if (task == current)
gsbase = x86_gsbase_read_cpu_inactive();
else if (task->thread.gsindex == 0)
gsbase = task->thread.gsbase;
else
gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
return gsbase;
}
int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
/*
* Not strictly needed for %fs, but do it for symmetry
* with %gs
*/
if (unlikely(fsbase >= TASK_SIZE_MAX))
return -EPERM;
preempt_disable();
task->thread.fsbase = fsbase;
if (task == current)
x86_fsbase_write_cpu(fsbase);
task->thread.fsindex = 0;
preempt_enable();
return 0;
}
int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
if (unlikely(gsbase >= TASK_SIZE_MAX))
return -EPERM;
preempt_disable();
task->thread.gsbase = gsbase;
if (task == current)
x86_gsbase_write_cpu_inactive(gsbase);
task->thread.gsindex = 0;
preempt_enable();
return 0;
}
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
......@@ -473,10 +606,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
load_seg_legacy(prev->fsindex, prev->fsbase,
next->fsindex, next->fsbase, FS);
load_seg_legacy(prev->gsindex, prev->gsbase,
next->gsindex, next->gsbase, GS);
x86_fsgsbase_load(prev, next);
switch_fpu_finish(next_fpu, cpu);
......@@ -627,54 +757,25 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
int doit = task == current;
int cpu;
switch (option) {
case ARCH_SET_GS:
if (arg2 >= TASK_SIZE_MAX)
return -EPERM;
cpu = get_cpu();
task->thread.gsindex = 0;
task->thread.gsbase = arg2;
if (doit) {
load_gs_index(0);
ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
}
put_cpu();
case ARCH_SET_GS: {
ret = x86_gsbase_write_task(task, arg2);
break;
case ARCH_SET_FS:
/* Not strictly needed for fs, but do it for symmetry
with gs */
if (arg2 >= TASK_SIZE_MAX)
return -EPERM;
cpu = get_cpu();
task->thread.fsindex = 0;
task->thread.fsbase = arg2;
if (doit) {
/* set the selector to 0 to not confuse __switch_to */
loadsegment(fs, 0);
ret = wrmsrl_safe(MSR_FS_BASE, arg2);
}
put_cpu();
case ARCH_SET_FS: {
ret = x86_fsbase_write_task(task, arg2);
break;
}
case ARCH_GET_FS: {
unsigned long base;
unsigned long base = x86_fsbase_read_task(task);
if (doit)
rdmsrl(MSR_FS_BASE, base);
else
base = task->thread.fsbase;
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
unsigned long base;
unsigned long base = x86_gsbase_read_task(task);
if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gsbase;
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
......
......@@ -39,6 +39,7 @@
#include <asm/hw_breakpoint.h>
#include <asm/traps.h>
#include <asm/syscall.h>
#include <asm/fsgsbase.h>
#include "tls.h"
......@@ -396,12 +397,11 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
/*
* When changing the segment base, use do_arch_prctl_64
* to set either thread.fs or thread.fsindex and the
* corresponding GDT slot.
* When changing the FS base, use the same
* mechanism as for do_arch_prctl_64().
*/
if (child->thread.fsbase != value)
return do_arch_prctl_64(child, ARCH_SET_FS, value);
return x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
/*
......@@ -410,7 +410,7 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
if (child->thread.gsbase != value)
return do_arch_prctl_64(child, ARCH_SET_GS, value);
return x86_gsbase_write_task(child, value);
return 0;
#endif
}
......@@ -434,20 +434,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return get_flags(task);
#ifdef CONFIG_X86_64
case offsetof(struct user_regs_struct, fs_base): {
/*
* XXX: This will not behave as expected if called on
* current or if fsindex != 0.
*/
return task->thread.fsbase;
}
case offsetof(struct user_regs_struct, gs_base): {
/*
* XXX: This will not behave as expected if called on
* current or if fsindex != 0.
*/
return task->thread.gsbase;
}
case offsetof(struct user_regs_struct, fs_base):
return x86_fsbase_read_task(task);
case offsetof(struct user_regs_struct, gs_base):
return x86_gsbase_read_task(task);
#endif
}
......
......@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
return rc;
}
void memcpy_flushcache(void *_dst, const void *_src, size_t size)
void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
......@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
clean_cache_range((void *) dest, size);
}
}
EXPORT_SYMBOL_GPL(memcpy_flushcache);
EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment