Commit a13dc4d4 authored by Linus Torvalds

Merge tag 'x86_cleanups_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 cleanups from Borislav Petkov:

 - Serious sanitization and cleanup of the whole APERF/MPERF and
   frequency invariance code along with removing the need for
   unnecessary IPIs

 - Finally remove a.out support

 - The usual trivial cleanups and fixes all over x86

* tag 'x86_cleanups_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  x86: Remove empty files
  x86/speculation: Add missing srbds=off to the mitigations= help text
  x86/prctl: Remove pointless task argument
  x86/aperfmperf: Make it correct on 32bit and UP kernels
  x86/aperfmperf: Integrate the fallback code from show_cpuinfo()
  x86/aperfmperf: Replace arch_freq_get_on_cpu()
  x86/aperfmperf: Replace aperfmperf_get_khz()
  x86/aperfmperf: Store aperf/mperf data for cpu frequency reads
  x86/aperfmperf: Make parts of the frequency invariance code unconditional
  x86/aperfmperf: Restructure arch_scale_freq_tick()
  x86/aperfmperf: Put frequency invariance aperf/mperf data into a struct
  x86/aperfmperf: Untangle Intel and AMD frequency invariance init
  x86/aperfmperf: Separate AP/BP frequency invariance init
  x86/smp: Move APERF/MPERF code where it belongs
  x86/aperfmperf: Dont wake idle CPUs in arch_freq_get_on_cpu()
  x86/process: Fix kernel-doc warning due to a changed function name
  x86: Remove a.out support
  x86/mm: Replace nodes_weight() with nodes_empty() where appropriate
  x86: Replace cpumask_weight() with cpumask_empty() where appropriate
  x86/pkeys: Remove __arch_set_user_pkey_access() declaration
  ...
parents 1de564b8 d936411d
......@@ -3147,6 +3147,7 @@
mds=off [X86]
tsx_async_abort=off [X86]
kvm.nx_huge_pages=off [X86]
srbds=off [X86,INTEL]
no_entry_flush [PPC]
no_uaccess_flush [PPC]
......
......@@ -7385,7 +7385,6 @@ L: linux-mm@kvack.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve
F: arch/alpha/kernel/binfmt_loader.c
F: arch/x86/ia32/ia32_aout.c
F: fs/*binfmt_*.c
F: fs/exec.c
F: include/linux/binfmts.h
......
......@@ -2842,13 +2842,6 @@ config IA32_EMULATION
64-bit kernel. You should likely turn this on, unless you're
100% sure that you don't have any 32-bit programs left.
config IA32_AOUT
tristate "IA32 a.out support"
depends on IA32_EMULATION
depends on BROKEN
help
Support old a.out binaries in the 32bit emulation.
config X86_X32_ABI
bool "x32 ABI for 64-bit mode"
depends on X86_64
......
......@@ -5,7 +5,5 @@
obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o
obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
audit-class-$(CONFIG_AUDIT) := audit.o
obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
// SPDX-License-Identifier: GPL-2.0-only
/*
* a.out loader for x86-64
*
* Copyright (C) 1991, 1992, 1996 Linus Torvalds
* Hacked together by Andi Kleen
*/
#include <linux/module.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/a.out.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/binfmts.h>
#include <linux/personality.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/perf_event.h>
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/user32.h>
#include <asm/ia32.h>
#undef WARN_OLD
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file *);
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
};
static int set_brk(unsigned long start, unsigned long end)
{
start = PAGE_ALIGN(start);
end = PAGE_ALIGN(end);
if (end <= start)
return 0;
return vm_brk(start, end - start);
}
/*
* create_aout_tables() parses the env- and arg-strings in new user
* memory and creates the pointer tables from them, and puts their
* addresses on the "stack", returning the new stack pointer value.
*/
static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
{
u32 __user *argv, *envp, *sp;
int argc = bprm->argc, envc = bprm->envc;
sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
sp -= envc+1;
envp = sp;
sp -= argc+1;
argv = sp;
put_user((unsigned long) envp, --sp);
put_user((unsigned long) argv, --sp);
put_user(argc, --sp);
current->mm->arg_start = (unsigned long) p;
while (argc-- > 0) {
char c;
put_user((u32)(unsigned long)p, argv++);
do {
get_user(c, p++);
} while (c);
}
put_user(0, argv);
current->mm->arg_end = current->mm->env_start = (unsigned long) p;
while (envc-- > 0) {
char c;
put_user((u32)(unsigned long)p, envp++);
do {
get_user(c, p++);
} while (c);
}
put_user(0, envp);
current->mm->env_end = (unsigned long) p;
return sp;
}
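/*
 * Illustrative note with assumed layout drawing, not part of the original
 * file: the 32-bit user stack produced by create_aout_tables() above looks
 * like (low to high addresses)
 *
 *   sp -> [argc][&argv[0]][&envp[0]][argv[0..argc-1], NULL][envp[0..envc-1], NULL][strings]
 *
 * i.e. unlike the ELF convention, argc is followed by *pointers to* the
 * argv and envp tables rather than by the tables themselves.
 */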
/*
* These are the functions used to load a.out style executables and shared
* libraries. There is no binary dependent code anywhere else.
*/
static int load_aout_binary(struct linux_binprm *bprm)
{
unsigned long error, fd_offset, rlim;
struct pt_regs *regs = current_pt_regs();
struct exec ex;
int retval;
ex = *((struct exec *) bprm->buf); /* exec-header */
if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
N_TRSIZE(ex) || N_DRSIZE(ex) ||
i_size_read(file_inode(bprm->file)) <
ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
return -ENOEXEC;
}
fd_offset = N_TXTOFF(ex);
/* Check initial limits. This avoids letting people circumvent
* size limits imposed on them by creating programs with large
* arrays in the data or bss.
*/
rlim = rlimit(RLIMIT_DATA);
if (rlim >= RLIM_INFINITY)
rlim = ~0;
if (ex.a_data + ex.a_bss > rlim)
return -ENOMEM;
/* Flush all traces of the currently running executable */
retval = begin_new_exec(bprm);
if (retval)
return retval;
/* OK, This is the point of no return */
set_personality(PER_LINUX);
set_personality_ia32(false);
setup_new_exec(bprm);
regs->cs = __USER32_CS;
regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
regs->r13 = regs->r14 = regs->r15 = 0;
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
current->mm->end_data = ex.a_data +
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0)
return retval;
if (N_MAGIC(ex) == OMAGIC) {
unsigned long text_addr, map_size;
text_addr = N_TXTADDR(ex);
map_size = ex.a_text+ex.a_data;
error = vm_brk(text_addr & PAGE_MASK, map_size);
if (error)
return error;
error = read_code(bprm->file, text_addr, 32,
ex.a_text + ex.a_data);
if ((signed long)error < 0)
return error;
} else {
#ifdef WARN_OLD
static unsigned long error_time, error_time2;
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) &&
time_after(jiffies, error_time2 + 5*HZ)) {
printk(KERN_NOTICE "executable not page aligned\n");
error_time2 = jiffies;
}
if ((fd_offset & ~PAGE_MASK) != 0 &&
time_after(jiffies, error_time + 5*HZ)) {
printk(KERN_WARNING
"fd_offset is not page aligned. Please convert "
"program: %pD\n",
bprm->file);
error_time = jiffies;
}
#endif
if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
if (error)
return error;
read_code(bprm->file, N_TXTADDR(ex), fd_offset,
ex.a_text+ex.a_data);
goto beyond_if;
}
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
fd_offset);
if (error != N_TXTADDR(ex))
return error;
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
}
beyond_if:
error = set_brk(current->mm->start_brk, current->mm->brk);
if (error)
return error;
set_binfmt(&aout_format);
current->mm->start_stack =
(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
/* start thread */
loadsegment(fs, 0);
loadsegment(ds, __USER32_DS);
loadsegment(es, __USER32_DS);
load_gs_index(0);
(regs)->ip = ex.a_entry;
(regs)->sp = current->mm->start_stack;
(regs)->flags = 0x200;
(regs)->cs = __USER32_CS;
(regs)->ss = __USER32_DS;
regs->r8 = regs->r9 = regs->r10 = regs->r11 =
regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
return 0;
}
static int load_aout_library(struct file *file)
{
unsigned long bss, start_addr, len, error;
int retval;
struct exec ex;
loff_t pos = 0;
retval = -ENOEXEC;
error = kernel_read(file, &ex, sizeof(ex), &pos);
if (error != sizeof(ex))
goto out;
/* We come in here for the regular a.out style of shared libraries */
if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
i_size_read(file_inode(file)) <
ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
goto out;
}
if (N_FLAGS(ex))
goto out;
/* For QMAGIC, the starting address is 0x20 into the page. We mask
this off to get the starting address for the page */
start_addr = ex.a_entry & 0xfffff000;
if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
#ifdef WARN_OLD
static unsigned long error_time;
if (time_after(jiffies, error_time + 5*HZ)) {
printk(KERN_WARNING
"N_TXTOFF is not page aligned. Please convert "
"library: %pD\n",
file);
error_time = jiffies;
}
#endif
retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
if (retval)
goto out;
read_code(file, start_addr, N_TXTOFF(ex),
ex.a_text + ex.a_data);
retval = 0;
goto out;
}
/* Now use mmap to map the library into memory. */
error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
N_TXTOFF(ex));
retval = error;
if (error != start_addr)
goto out;
len = PAGE_ALIGN(ex.a_text + ex.a_data);
bss = ex.a_text + ex.a_data + ex.a_bss;
if (bss > len) {
retval = vm_brk(start_addr + len, bss - len);
if (retval)
goto out;
}
retval = 0;
out:
return retval;
}
static int __init init_aout_binfmt(void)
{
register_binfmt(&aout_format);
return 0;
}
static void __exit exit_aout_binfmt(void)
{
unregister_binfmt(&aout_format);
}
module_init(init_aout_binfmt);
module_exit(exit_aout_binfmt);
MODULE_LICENSE("GPL");
......@@ -36,6 +36,8 @@ extern int _debug_hotplug_cpu(int cpu, int action);
#endif
#endif
extern void ap_init_aperfmperf(void);
int mwait_usable(const struct cpuinfo_x86 *);
unsigned int x86_family(unsigned int sig);
......
......@@ -162,7 +162,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
}
/* prctl */
struct task_struct;
extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2);
extern long fpu_xstate_prctl(int option, unsigned long arg2);
#endif /* _ASM_X86_FPU_API_H */
......@@ -41,9 +41,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
return __arch_override_mprotect_pkey(vma, prot, pkey);
}
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
......@@ -118,11 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey)
return 0;
}
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
static inline int vma_pkey(struct vm_area_struct *vma)
{
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
......
......@@ -42,7 +42,6 @@ void x86_configure_nx(void);
extern int reboot_force;
long do_arch_prctl_common(struct task_struct *task, int option,
unsigned long arg2);
long do_arch_prctl_common(int option, unsigned long arg2);
#endif /* _ASM_X86_PROTO_H */
......@@ -212,30 +212,19 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
extern void arch_set_max_freq_ratio(bool turbo_disabled);
void init_freq_invariance(bool secondary, bool cppc_ready);
extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
static inline void arch_set_max_freq_ratio(bool turbo_disabled)
{
}
static inline void init_freq_invariance(bool secondary, bool cppc_ready)
{
}
static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
#ifdef CONFIG_ACPI_CPPC_LIB
void init_freq_invariance_cppc(void);
#define arch_init_invariance_cppc init_freq_invariance_cppc
bool amd_set_max_freq_ratio(u64 *ratio);
#else
static inline bool amd_set_max_freq_ratio(u64 *ratio)
{
return false;
}
#endif
#endif /* _ASM_X86_TOPOLOGY_H */
......@@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
return err;
}
bool amd_set_max_freq_ratio(u64 *ratio)
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
u64 highest_perf, nominal_perf;
u64 perf_ratio;
int rc;
if (!ratio)
return false;
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
pr_debug("Could not retrieve perf counters (%d)\n", rc);
return false;
return;
}
highest_perf = amd_get_highest_perf();
......@@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio)
if (!highest_perf || !nominal_perf) {
pr_debug("Could not retrieve highest or nominal performance\n");
return false;
return;
}
perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
......@@ -79,25 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio)
perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
if (!perf_ratio) {
pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
return false;
return;
}
*ratio = perf_ratio;
arch_set_max_freq_ratio(false);
return true;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
static DEFINE_MUTEX(freq_invariance_lock);
void init_freq_invariance_cppc(void)
{
static bool secondary;
static bool init_done;
mutex_lock(&freq_invariance_lock);
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
init_freq_invariance(secondary, true);
secondary = true;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return;
mutex_lock(&freq_invariance_lock);
if (!init_done)
amd_set_max_freq_ratio();
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
......@@ -6,146 +6,446 @@
* Copyright (C) 2017 Intel Corp.
* Author: Len Brown <len.brown@intel.com>
*/
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/sched/isolation.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "cpu.h"
struct aperfmperf_sample {
unsigned int khz;
atomic_t scfpending;
ktime_t time;
u64 aperf;
u64 mperf;
struct aperfmperf {
seqcount_t seq;
unsigned long last_update;
u64 acnt;
u64 mcnt;
u64 aperf;
u64 mperf;
};
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
.seq = SEQCNT_ZERO(cpu_samples.seq)
};
#define APERFMPERF_CACHE_THRESHOLD_MS 10
#define APERFMPERF_REFRESH_DELAY_MS 10
#define APERFMPERF_STALE_THRESHOLD_MS 1000
static void init_counter_refs(void)
{
u64 aperf, mperf;
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
this_cpu_write(cpu_samples.aperf, aperf);
this_cpu_write(cpu_samples.mperf, mperf);
}
#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
* aperfmperf_snapshot_khz()
* On the current CPU, snapshot APERF, MPERF, and jiffies
* unless we already did it within 10ms
* calculate kHz, save snapshot
* APERF/MPERF frequency ratio computation.
*
* The scheduler wants to do frequency invariant accounting and needs a <1
* ratio to account for the 'current' frequency, corresponding to
* freq_curr / freq_max.
*
* Since the frequency freq_curr on x86 is controlled by micro-controller and
* our P-state setting is little more than a request/hint, we need to observe
* the effective frequency 'BusyMHz', i.e. the average frequency over a time
* interval after discarding idle time. This is given by:
*
* BusyMHz = delta_APERF / delta_MPERF * freq_base
*
* where freq_base is the max non-turbo P-state.
*
* The freq_max term has to be set to a somewhat arbitrary value, because we
* can't know which turbo states will be available at a given point in time:
* it all depends on the thermal headroom of the entire package. We set it to
* the turbo level with 4 cores active.
*
* Benchmarks show that's a good compromise between the 1C turbo ratio
* (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
* which would ignore the entire turbo range (a conspicuous part, making
* freq_curr/freq_max always maxed out).
*
* An exception to the heuristic above is the Atom uarch, where we choose the
* highest turbo level for freq_max since Atom's are generally oriented towards
* power efficiency.
*
* Setting freq_max to anything less than the 1C turbo ratio makes the ratio
* freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
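/*
 * Illustrative sketch with assumed numbers, not part of the original source:
 * with a 2.0 GHz base frequency and a 2.8 GHz 4C turbo level, the ratio kept
 * in arch_max_freq_ratio is 2800 * SCHED_CAPACITY_SCALE / 2000 ~= 1433.
 * A tick spent busy at base frequency (delta_APERF == delta_MPERF) then
 * yields, in scale_freq_tick() below,
 *
 *   freq_scale = (delta_APERF << 2*SCHED_CAPACITY_SHIFT) /
 *                (delta_MPERF * arch_max_freq_ratio)
 *              = (1 << 20) / 1433 ~= 731 ~= 1024 * 2000/2800,
 *
 * while a tick spent at the 4C turbo level saturates the ratio and is
 * clipped to SCHED_CAPACITY_SCALE (1024).
 */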
static void aperfmperf_snapshot_khz(void *dummy)
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
void arch_set_max_freq_ratio(bool turbo_disabled)
{
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
unsigned long flags;
arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
local_irq_restore(flags);
static bool __init turbo_disabled(void)
{
u64 misc_en;
int err;
err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
if (err)
return false;
return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
int err;
err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
if (err)
return false;
err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
if (err)
return false;
*base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
*turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
return true;
}
#define X86_MATCH(model) \
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
X86_MATCH(XEON_PHI_KNL),
X86_MATCH(XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
X86_MATCH(SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
X86_MATCH(ATOM_GOLDMONT),
X86_MATCH(ATOM_GOLDMONT_D),
X86_MATCH(ATOM_GOLDMONT_PLUS),
{}
};
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int num_delta_fratio)
{
int fratio, delta_fratio, found;
int err, i;
u64 msr;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
fratio = (msr >> 8) & 0xFF;
i = 16;
found = 0;
do {
if (found >= num_delta_fratio) {
*turbo_freq = fratio;
return true;
}
delta_fratio = (msr >> (i + 5)) & 0x7;
if (delta_fratio) {
found += 1;
fratio -= delta_fratio;
}
i += 8;
} while (i < 64);
return true;
}
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
u64 ratios, counts;
u32 group_size;
int err, i;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
if (err)
return false;
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
if (err)
return false;
for (i = 0; i < 64; i += 8) {
group_size = (counts >> i) & 0xFF;
if (group_size >= size) {
*turbo_freq = (ratios >> i) & 0xFF;
return true;
}
}
return false;
}
aperf_delta = aperf - s->aperf;
mperf_delta = mperf - s->mperf;
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
u64 msr;
int err;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
*turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
/* The CPU may have less than 4 cores */
if (!*turbo_freq)
*turbo_freq = msr & 0xFF; /* 1C turbo */
return true;
}
static bool __init intel_set_max_freq_ratio(void)
{
u64 base_freq, turbo_freq;
u64 turbo_ratio;
if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
goto out;
if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
return false;
out:
/*
* There is no architectural guarantee that MPERF
* increments faster than we can read it.
* Some hypervisors advertise X86_FEATURE_APERFMPERF
* but then fill all MSR's with zeroes.
* Some CPUs have turbo boost but don't declare any turbo ratio
* in MSR_TURBO_RATIO_LIMIT.
*/
if (mperf_delta == 0)
return;
if (!base_freq || !turbo_freq) {
pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
return false;
}
s->time = ktime_get();
s->aperf = aperf;
s->mperf = mperf;
s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
atomic_set_release(&s->scfpending, 0);
turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
if (!turbo_ratio) {
pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
return false;
}
arch_turbo_freq_ratio = turbo_ratio;
arch_set_max_freq_ratio(turbo_disabled());
return true;
}
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
.resume = init_counter_refs,
};
static void register_freq_invariance_syscore_ops(void)
{
s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif
/* Don't bother re-computing within the cache threshold time. */
if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return true;
static void freq_invariance_enable(void)
{
if (static_branch_unlikely(&arch_scale_freq_key)) {
WARN_ON_ONCE(1);
return;
}
static_branch_enable(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
arch_turbo_freq_ratio = ratio;
arch_set_max_freq_ratio(turbo_disabled);
freq_invariance_enable();
}
static void __init bp_init_freq_invariance(void)
{
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
if (!atomic_xchg(&s->scfpending, 1) || wait)
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
if (intel_set_max_freq_ratio())
freq_invariance_enable();
}
/* Return false if the previous iteration was too long ago. */
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
static void disable_freq_invariance_workfn(struct work_struct *work)
{
static_branch_disable(&arch_scale_freq_key);
}
unsigned int aperfmperf_get_khz(int cpu)
static DECLARE_WORK(disable_freq_invariance_work,
disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
if (!cpu_khz)
return 0;
u64 freq_scale;
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
if (!arch_scale_freq_invariant())
return;
if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
return 0;
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
if (rcu_is_idle_cpu(cpu))
return 0; /* Idle CPUs are completely uninteresting. */
if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
goto error;
aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
return per_cpu(samples.khz, cpu);
freq_scale = div64_u64(acnt, mcnt);
if (!freq_scale)
goto error;
if (freq_scale > SCHED_CAPACITY_SCALE)
freq_scale = SCHED_CAPACITY_SCALE;
this_cpu_write(arch_freq_scale, freq_scale);
return;
error:
pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
void arch_freq_prepare_all(void)
void arch_scale_freq_tick(void)
{
ktime_t now = ktime_get();
bool wait = false;
int cpu;
struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
u64 acnt, mcnt, aperf, mperf;
if (!cpu_khz)
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
acnt = aperf - s->aperf;
mcnt = mperf - s->mperf;
for_each_online_cpu(cpu) {
if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
continue;
if (rcu_is_idle_cpu(cpu))
continue; /* Idle CPUs are completely uninteresting. */
if (!aperfmperf_snapshot_cpu(cpu, now, false))
wait = true;
}
s->aperf = aperf;
s->mperf = mperf;
raw_write_seqcount_begin(&s->seq);
s->last_update = jiffies;
s->acnt = acnt;
s->mcnt = mcnt;
raw_write_seqcount_end(&s->seq);
if (wait)
msleep(APERFMPERF_REFRESH_DELAY_MS);
scale_freq_tick(acnt, mcnt);
}
/*
* Discard samples older than the defined maximum sample age of 20ms. There
* is no point in sending IPIs in such a case. If the scheduler tick was
* not running then the CPU is either idle or isolated.
*/
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
unsigned int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
unsigned int seq, freq;
unsigned long last;
u64 acnt, mcnt;
if (!cpu_khz)
return 0;
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
goto fallback;
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
do {
seq = raw_read_seqcount_begin(&s->seq);
last = s->last_update;
acnt = s->acnt;
mcnt = s->mcnt;
} while (read_seqcount_retry(&s->seq, seq));
if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
return 0;
/*
* Bail on invalid count and when the last update was too long ago,
* which covers idle and NOHZ full CPUs.
*/
if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
goto fallback;
return div64_u64((cpu_khz * acnt), mcnt);
fallback:
freq = cpufreq_quick_get(cpu);
return freq ? freq : cpu_khz;
}
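/*
 * Illustrative sketch with assumed numbers, not part of the original source:
 * with cpu_khz = 2000000 (a 2 GHz base clock) and a last tick sample with
 * acnt/mcnt = 5/4, arch_freq_get_on_cpu() above reports
 * 2000000 * 5 / 4 = 2500000 kHz, which show_cpuinfo() prints as
 * "cpu MHz : 2500.000". If the sample is stale or APERF/MPERF is not
 * available, it falls back to cpufreq_quick_get() and finally to cpu_khz.
 */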
if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
return per_cpu(samples.khz, cpu);
static int __init bp_init_aperfmperf(void)
{
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return 0;
msleep(APERFMPERF_REFRESH_DELAY_MS);
atomic_set(&s->scfpending, 1);
smp_mb(); /* ->scfpending before smp_call_function_single(). */
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
init_counter_refs();
bp_init_freq_invariance();
return 0;
}
early_initcall(bp_init_aperfmperf);
return per_cpu(samples.khz, cpu);
void ap_init_aperfmperf(void)
{
if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
init_counter_refs();
}
......@@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
unsigned int freq = aperfmperf_get_khz(cpu);
if (!freq)
freq = cpufreq_quick_get(cpu);
if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
freq / 1000, (freq % 1000));
unsigned int freq = arch_freq_get_on_cpu(cpu);
seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
......
......@@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus belong to parent ctrl group */
cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
if (cpumask_weight(tmpmask)) {
if (!cpumask_empty(tmpmask)) {
rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
return -EINVAL;
}
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
if (cpumask_weight(tmpmask)) {
if (!cpumask_empty(tmpmask)) {
/* Give any dropped cpus to parent rdtgroup */
cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
update_closid_rmid(tmpmask, prgrp);
......@@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu rmid
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
if (cpumask_weight(tmpmask)) {
if (!cpumask_empty(tmpmask)) {
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
if (crgrp == rdtgrp)
......@@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
if (cpumask_weight(tmpmask)) {
if (!cpumask_empty(tmpmask)) {
/* Can't drop from default group */
if (rdtgrp == &rdtgroup_default) {
rdt_last_cmd_puts("Can't drop CPUs from default group\n");
......@@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu closid/rmid.
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
if (cpumask_weight(tmpmask)) {
if (!cpumask_empty(tmpmask)) {
list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
if (r == rdtgrp)
continue;
cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
if (cpumask_weight(tmpmask1))
if (!cpumask_empty(tmpmask1))
cpumask_rdtgrp_clear(r, tmpmask1);
}
update_closid_rmid(tmpmask, rdtgrp);
......@@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
/* check that user didn't specify any offline cpus */
cpumask_andnot(tmpmask, newmask, cpu_online_mask);
if (cpumask_weight(tmpmask)) {
if (!cpumask_empty(tmpmask)) {
ret = -EINVAL;
rdt_last_cmd_puts("Can only assign online CPUs\n");
goto unlock;
......
......@@ -1687,16 +1687,13 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
* e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
* XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
*/
long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
long fpu_xstate_prctl(int option, unsigned long arg2)
{
u64 __user *uptr = (u64 __user *)arg2;
u64 permitted, supported;
unsigned long idx = arg2;
bool guest = false;
if (tsk != current)
return -EPERM;
switch (option) {
case ARCH_GET_XCOMP_SUPP:
supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
......
......@@ -335,7 +335,7 @@ static int get_cpuid_mode(void)
return !test_thread_flag(TIF_NOCPUID);
}
static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
static int set_cpuid_mode(unsigned long cpuid_enabled)
{
if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
return -ENODEV;
......@@ -406,7 +406,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
}
/**
* tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
* native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
*/
void native_tss_update_io_bitmap(void)
{
......@@ -989,20 +989,19 @@ unsigned long __get_wchan(struct task_struct *p)
return addr;
}
long do_arch_prctl_common(struct task_struct *task, int option,
unsigned long arg2)
long do_arch_prctl_common(int option, unsigned long arg2)
{
switch (option) {
case ARCH_GET_CPUID:
return get_cpuid_mode();
case ARCH_SET_CPUID:
return set_cpuid_mode(task, arg2);
return set_cpuid_mode(arg2);
case ARCH_GET_XCOMP_SUPP:
case ARCH_GET_XCOMP_PERM:
case ARCH_REQ_XCOMP_PERM:
case ARCH_GET_XCOMP_GUEST_PERM:
case ARCH_REQ_XCOMP_GUEST_PERM:
return fpu_xstate_prctl(task, option, arg2);
return fpu_xstate_prctl(option, arg2);
}
return -EINVAL;
......
......@@ -222,5 +222,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
return do_arch_prctl_common(current, option, arg2);
return do_arch_prctl_common(option, arg2);
}
......@@ -844,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
ret = do_arch_prctl_64(current, option, arg2);
if (ret == -EINVAL)
ret = do_arch_prctl_common(current, option, arg2);
ret = do_arch_prctl_common(option, arg2);
return ret;
}
......@@ -852,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
return do_arch_prctl_common(current, option, arg2);
return do_arch_prctl_common(option, arg2);
}
#endif
......
......@@ -56,7 +56,6 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
......@@ -188,7 +187,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
init_freq_invariance(true, false);
ap_init_aperfmperf();
/*
* Get our bogomips.
......@@ -1406,7 +1405,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
smp_prepare_cpus_common();
init_freq_invariance(false, false);
smp_sanity_check();
switch (apic_intr_mode) {
......@@ -1858,357 +1856,3 @@ void native_play_dead(void)
}
#endif
#ifdef CONFIG_X86_64
/*
* APERF/MPERF frequency ratio computation.
*
* The scheduler wants to do frequency invariant accounting and needs a <1
* ratio to account for the 'current' frequency, corresponding to
* freq_curr / freq_max.
*
* Since the frequency freq_curr on x86 is controlled by micro-controller and
* our P-state setting is little more than a request/hint, we need to observe
* the effective frequency 'BusyMHz', i.e. the average frequency over a time
* interval after discarding idle time. This is given by:
*
* BusyMHz = delta_APERF / delta_MPERF * freq_base
*
* where freq_base is the max non-turbo P-state.
*
* The freq_max term has to be set to a somewhat arbitrary value, because we
* can't know which turbo states will be available at a given point in time:
* it all depends on the thermal headroom of the entire package. We set it to
* the turbo level with 4 cores active.
*
* Benchmarks show that's a good compromise between the 1C turbo ratio
* (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
* which would ignore the entire turbo range (a conspicuous part, making
* freq_curr/freq_max always maxed out).
*
* An exception to the heuristic above is the Atom uarch, where we choose the
* highest turbo level for freq_max since Atom's are generally oriented towards
* power efficiency.
*
* Setting freq_max to anything less than the 1C turbo ratio makes the ratio
* freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
static DEFINE_PER_CPU(u64, arch_prev_aperf);
static DEFINE_PER_CPU(u64, arch_prev_mperf);
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
void arch_set_max_freq_ratio(bool turbo_disabled)
{
arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool turbo_disabled(void)
{
u64 misc_en;
int err;
err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
if (err)
return false;
return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}
static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
int err;
err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
if (err)
return false;
err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
if (err)
return false;
*base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
*turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
return true;
}
#define X86_MATCH(model) \
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
X86_MATCH(XEON_PHI_KNL),
X86_MATCH(XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
X86_MATCH(SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
X86_MATCH(ATOM_GOLDMONT),
X86_MATCH(ATOM_GOLDMONT_D),
X86_MATCH(ATOM_GOLDMONT_PLUS),
{}
};
static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int num_delta_fratio)
{
int fratio, delta_fratio, found;
int err, i;
u64 msr;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
fratio = (msr >> 8) & 0xFF;
i = 16;
found = 0;
do {
if (found >= num_delta_fratio) {
*turbo_freq = fratio;
return true;
}
delta_fratio = (msr >> (i + 5)) & 0x7;
if (delta_fratio) {
found += 1;
fratio -= delta_fratio;
}
i += 8;
} while (i < 64);
return true;
}
static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
u64 ratios, counts;
u32 group_size;
int err, i;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
if (err)
return false;
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
if (err)
return false;
for (i = 0; i < 64; i += 8) {
group_size = (counts >> i) & 0xFF;
if (group_size >= size) {
*turbo_freq = (ratios >> i) & 0xFF;
return true;
}
}
return false;
}
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
u64 msr;
int err;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
*turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
/* The CPU may have less than 4 cores */
if (!*turbo_freq)
*turbo_freq = msr & 0xFF; /* 1C turbo */
return true;
}
static bool intel_set_max_freq_ratio(void)
{
u64 base_freq, turbo_freq;
u64 turbo_ratio;
if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
goto out;
if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
return false;
out:
/*
* Some hypervisors advertise X86_FEATURE_APERFMPERF
* but then fill all MSR's with zeroes.
* Some CPUs have turbo boost but don't declare any turbo ratio
* in MSR_TURBO_RATIO_LIMIT.
*/
if (!base_freq || !turbo_freq) {
pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
return false;
}
turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
if (!turbo_ratio) {
pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
return false;
}
arch_turbo_freq_ratio = turbo_ratio;
arch_set_max_freq_ratio(turbo_disabled());
return true;
}
static void init_counter_refs(void)
{
u64 aperf, mperf;
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
this_cpu_write(arch_prev_aperf, aperf);
this_cpu_write(arch_prev_mperf, mperf);
}
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
.resume = init_counter_refs,
};
static void register_freq_invariance_syscore_ops(void)
{
/* Bail out if registered already. */
if (freq_invariance_syscore_ops.node.prev)
return;
register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif
void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
if (secondary) {
if (static_branch_likely(&arch_scale_freq_key)) {
init_counter_refs();
}
return;
}
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
if (!cppc_ready) {
return;
}
ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio);
}
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
}
}
static void disable_freq_invariance_workfn(struct work_struct *work)
{
static_branch_disable(&arch_scale_freq_key);
}
static DECLARE_WORK(disable_freq_invariance_work,
disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
u64 freq_scale;
u64 aperf, mperf;
u64 acnt, mcnt;
if (!arch_scale_freq_invariant())
return;
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
acnt = aperf - this_cpu_read(arch_prev_aperf);
mcnt = mperf - this_cpu_read(arch_prev_mperf);
this_cpu_write(arch_prev_aperf, aperf);
this_cpu_write(arch_prev_mperf, mperf);
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
if (!freq_scale)
goto error;
if (freq_scale > SCHED_CAPACITY_SCALE)
freq_scale = SCHED_CAPACITY_SCALE;
this_cpu_write(arch_freq_scale, freq_scale);
return;
error:
pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
schedule_work(&disable_freq_invariance_work);
}
#endif /* CONFIG_X86_64 */
......@@ -154,7 +154,7 @@ int __init amd_numa_init(void)
node_set(nodeid, numa_nodes_parsed);
}
if (!nodes_weight(numa_nodes_parsed))
if (nodes_empty(numa_nodes_parsed))
return -ENOENT;
/*
......
......@@ -400,7 +400,7 @@ static void leave_uniprocessor(void)
int cpu;
int err;
if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus))
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
......
......@@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
*/
while (nodes_weight(physnode_mask)) {
while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
......@@ -270,7 +270,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
*/
while (nodes_weight(physnode_mask)) {
while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
......
......@@ -985,7 +985,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
/* Clear global flags */
if (master) {
if (cpumask_weight(uv_nmi_cpu_mask))
if (!cpumask_empty(uv_nmi_cpu_mask))
uv_nmi_cleanup_mask();
atomic_set(&uv_nmi_cpus_in_nmi, -1);
atomic_set(&uv_nmi_cpu, -1);
......
......@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
__weak void arch_freq_prepare_all(void)
{
}
extern const struct seq_operations cpuinfo_op;
static int cpuinfo_open(struct inode *inode, struct file *file)
{
arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
......
......@@ -1199,7 +1199,6 @@ static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov) { }
#endif
extern void arch_freq_prepare_all(void);
extern unsigned int arch_freq_get_on_cpu(int cpu);
#ifndef arch_set_freq_scale
......