Commit 88817acb authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'pm-6.1-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm

Pull power management fixes from Rafael Wysocki:
 "These revert a recent change in the schedutil cpufreq governor that
  had not been expected to make any functional difference, but turned
  out to introduce a performance regression, fix an initialization issue
  in the amd-pstate driver and make it actually replace the venerable
  ACPI cpufreq driver on the supported systems by default.

  Specifics:

   - Revert a recent schedutil cpufreq governor change that introduced a
     performace regression on Pixel 6 (Sam Wu)

   - Fix amd-pstate driver initialization after running the kernel via
     kexec (Wyes Karny)

   - Turn amd-pstate into a built-in driver which allows it to take
     precedence over acpi-cpufreq by default on supported systems and
     amend it with a mechanism to disable this behavior (Perry Yuan)

   - Update amd-pstate documentation in accordance with the other
     changes made to it (Perry Yuan)"

* tag 'pm-6.1-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  Documentation: add amd-pstate kernel command line options
  Documentation: amd-pstate: add driver working mode introduction
  cpufreq: amd-pstate: add amd-pstate driver parameter for mode selection
  cpufreq: amd-pstate: change amd-pstate driver to be built-in type
  cpufreq: amd-pstate: cpufreq: amd-pstate: reset MSR_AMD_PERF_CTL register at init
  Revert "cpufreq: schedutil: Move max CPU capacity to sugov_policy"
parents e3ebac80 1056d314
......@@ -6959,3 +6959,14 @@
memory, and other data can't be written using
xmon commands.
off xmon is disabled.
amd_pstate= [X86]
disable
Do not enable amd_pstate as the default
scaling driver for the supported processors
passive
Use amd_pstate as a scaling driver, driver requests a
desired performance on this abstract scale and the power
management firmware translates the requests into actual
hardware states (core frequency, data fabric and memory
clocks etc.)
......@@ -283,23 +283,19 @@ efficiency frequency management method on AMD processors.
Kernel Module Options for ``amd-pstate``
=========================================
.. _shared_mem:
``shared_mem``
Use a module param (shared_mem) to enable related processors manually with
**amd_pstate.shared_mem=1**.
Due to the performance issue on the processors with `Shared Memory Support
<perf_cap_>`_, we disable it presently and will re-enable this by default
once we address performance issue with this solution.
To check whether the current processor is using `Full MSR Support <perf_cap_>`_
or `Shared Memory Support <perf_cap_>`_ : ::
ray@hr-test1:~$ lscpu | grep cppc
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm
If the CPU flags have ``cppc``, then this processor supports `Full MSR Support
<perf_cap_>`_. Otherwise, it supports `Shared Memory Support <perf_cap_>`_.
Passive Mode
------------
``amd_pstate=passive``
It will be enabled if the ``amd_pstate=passive`` is passed to the kernel in the command line.
In this mode, ``amd_pstate`` driver software specifies a desired QoS target in the CPPC
performance scale as a relative number. This can be expressed as percentage of nominal
performance (infrastructure max). Below the nominal sustained performance level,
desired performance expresses the average performance level of the processor subject
to the Performance Reduction Tolerance register. Above the nominal performance level,
processor must provide at least nominal performance requested and go higher if current
operating conditions allow.
``cpupower`` tool support for ``amd-pstate``
......
......@@ -35,7 +35,7 @@ config X86_PCC_CPUFREQ
If in doubt, say N.
config X86_AMD_PSTATE
tristate "AMD Processor P-State driver"
bool "AMD Processor P-State driver"
depends on X86 && ACPI
select ACPI_PROCESSOR
select ACPI_CPPC_LIB if X86_64
......
......@@ -59,12 +59,8 @@
* we disable it by default to go acpi-cpufreq on these processors and add a
* module parameter to be able to enable it manually for debugging.
*/
static bool shared_mem = false;
module_param(shared_mem, bool, 0444);
MODULE_PARM_DESC(shared_mem,
"enable amd-pstate on processors with shared memory solution (false = disabled (default), true = enabled)");
static struct cpufreq_driver amd_pstate_driver;
static int cppc_load __initdata;
static inline int pstate_enable(bool enable)
{
......@@ -424,12 +420,22 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata)
amd_pstate_driver.boost_enabled = true;
}
static void amd_perf_ctl_reset(unsigned int cpu)
{
wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0);
}
static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
{
int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
struct device *dev;
struct amd_cpudata *cpudata;
/*
* Resetting PERF_CTL_MSR will put the CPU in P0 frequency,
* which is ideal for initialization process.
*/
amd_perf_ctl_reset(policy->cpu);
dev = get_cpu_device(policy->cpu);
if (!dev)
return -ENODEV;
......@@ -616,6 +622,15 @@ static int __init amd_pstate_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return -ENODEV;
/*
* by default the pstate driver is disabled to load
* enable the amd_pstate passive mode driver explicitly
* with amd_pstate=passive in kernel command line
*/
if (!cppc_load) {
pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n");
return -ENODEV;
}
if (!acpi_cpc_valid()) {
pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
......@@ -630,13 +645,11 @@ static int __init amd_pstate_init(void)
if (boot_cpu_has(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf;
} else if (shared_mem) {
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");
static_call_update(amd_pstate_enable, cppc_enable);
static_call_update(amd_pstate_init_perf, cppc_init_perf);
static_call_update(amd_pstate_update_perf, cppc_update_perf);
} else {
pr_info("This processor supports shared memory solution, you can enable it with amd_pstate.shared_mem=1\n");
return -ENODEV;
}
/* enable amd pstate feature */
......@@ -653,16 +666,22 @@ static int __init amd_pstate_init(void)
return ret;
}
device_initcall(amd_pstate_init);
static void __exit amd_pstate_exit(void)
static int __init amd_pstate_param(char *str)
{
cpufreq_unregister_driver(&amd_pstate_driver);
if (!str)
return -EINVAL;
amd_pstate_enable(false);
}
if (!strcmp(str, "disable")) {
cppc_load = 0;
pr_info("driver is explicitly disabled\n");
} else if (!strcmp(str, "passive"))
cppc_load = 1;
module_init(amd_pstate_init);
module_exit(amd_pstate_exit);
return 0;
}
early_param("amd_pstate", amd_pstate_param);
MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver");
......
......@@ -25,9 +25,6 @@ struct sugov_policy {
unsigned int next_freq;
unsigned int cached_raw_freq;
/* max CPU capacity, which is equal for all CPUs in freq. domain */
unsigned long max;
/* The next fields are only needed if fast switch cannot be used: */
struct irq_work irq_work;
struct kthread_work work;
......@@ -51,6 +48,7 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
......@@ -160,6 +158,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
......@@ -254,7 +253,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
*/
static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long boost;
/* No boost currently required */
......@@ -282,8 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
boost = sg_cpu->iowait_boost * sg_policy->max;
boost >>= SCHED_CAPACITY_SHIFT;
boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
......@@ -340,7 +337,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
if (!sugov_update_single_common(sg_cpu, time, flags))
return;
next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max);
next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
......@@ -376,7 +373,6 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long prev_util = sg_cpu->util;
/*
......@@ -403,8 +399,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
map_util_perf(sg_cpu->util),
sg_policy->max);
map_util_perf(sg_cpu->util), sg_cpu->max);
sg_cpu->sg_policy->last_freq_update_time = time;
}
......@@ -413,19 +408,25 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util = 0;
unsigned long util = 0, max = 1;
unsigned int j;
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time);
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
util = max(j_sg_cpu->util, util);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
}
return get_next_freq(sg_policy, util, sg_policy->max);
return get_next_freq(sg_policy, util, max);
}
static void
......@@ -751,7 +752,7 @@ static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
unsigned int cpu = cpumask_first(policy->cpus);
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
......@@ -759,7 +760,6 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
sg_policy->cached_raw_freq = 0;
sg_policy->max = arch_scale_cpu_capacity(cpu);
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment