Commit 4ed39004 authored by Rafael J. Wysocki's avatar Rafael J. Wysocki

Merge branch 'pm-cpufreq'

* pm-cpufreq: (94 commits)
  intel_pstate: Do not skip samples partially
  intel_pstate: Remove freq calculation from intel_pstate_calc_busy()
  intel_pstate: Move intel_pstate_calc_busy() into get_target_pstate_use_performance()
  intel_pstate: Optimize calculation for max/min_perf_adj
  intel_pstate: Remove extra conversions in pid calculation
  cpufreq: Move scheduler-related code to the sched directory
  Revert "cpufreq: postfix policy directory with the first CPU in related_cpus"
  cpufreq: Reduce cpufreq_update_util() overhead a bit
  cpufreq: Select IRQ_WORK if CPU_FREQ_GOV_COMMON is set
  cpufreq: Remove 'policy->governor_enabled'
  cpufreq: Rename __cpufreq_governor() to cpufreq_governor()
  cpufreq: Relocate handle_update() to kill its declaration
  cpufreq: governor: Drop unnecessary checks from show() and store()
  cpufreq: governor: Fix race in dbs_update_util_handler()
  cpufreq: governor: Make gov_set_update_util() static
  cpufreq: governor: Narrow down the dbs_data_mutex coverage
  cpufreq: governor: Make dbs_data_mutex static
  cpufreq: governor: Relocate definitions of tuners structures
  cpufreq: governor: Move per-CPU data to the common code
  cpufreq: governor: Make governor private data per-policy
  ...
parents b5d5fad9 4fec7ad5
......@@ -25,7 +25,7 @@ callback, so cpufreq core can't request a transition to a specific frequency.
The driver provides minimum and maximum frequency limits and callbacks to set a
policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
The cpufreq core can request the driver to operate in any of the two policies:
"performance: and "powersave". The driver decides which frequency to use based
"performance" and "powersave". The driver decides which frequency to use based
on the above policy selection considering minimum and maximum frequency limits.
The Intel P-State driver falls under the latter category, which implements the
......
......@@ -19,6 +19,7 @@ config CPU_FREQ
if CPU_FREQ
config CPU_FREQ_GOV_COMMON
select IRQ_WORK
bool
config CPU_FREQ_BOOST_SW
......
......@@ -70,6 +70,8 @@ struct acpi_cpufreq_data {
unsigned int cpu_feature;
unsigned int acpi_perf_cpu;
cpumask_var_t freqdomain_cpus;
void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val);
u32 (*cpu_freq_read)(struct acpi_pct_register *reg);
};
/* acpi_perf_data is a pointer to percpu data. */
......@@ -243,125 +245,119 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
}
}
struct msr_addr {
u32 reg;
};
u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
{
u32 val, dummy;
struct io_addr {
u16 port;
u8 bit_width;
};
rdmsr(MSR_IA32_PERF_CTL, val, dummy);
return val;
}
void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
{
u32 lo, hi;
rdmsr(MSR_IA32_PERF_CTL, lo, hi);
lo = (lo & ~INTEL_MSR_RANGE) | (val & INTEL_MSR_RANGE);
wrmsr(MSR_IA32_PERF_CTL, lo, hi);
}
u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
{
u32 val, dummy;
rdmsr(MSR_AMD_PERF_CTL, val, dummy);
return val;
}
void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
{
wrmsr(MSR_AMD_PERF_CTL, val, 0);
}
u32 cpu_freq_read_io(struct acpi_pct_register *reg)
{
u32 val;
acpi_os_read_port(reg->address, &val, reg->bit_width);
return val;
}
void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
{
acpi_os_write_port(reg->address, val, reg->bit_width);
}
struct drv_cmd {
unsigned int type;
const struct cpumask *mask;
union {
struct msr_addr msr;
struct io_addr io;
} addr;
struct acpi_pct_register *reg;
u32 val;
union {
void (*write)(struct acpi_pct_register *reg, u32 val);
u32 (*read)(struct acpi_pct_register *reg);
} func;
};
/* Called via smp_call_function_single(), on the target CPU */
static void do_drv_read(void *_cmd)
{
struct drv_cmd *cmd = _cmd;
u32 h;
switch (cmd->type) {
case SYSTEM_INTEL_MSR_CAPABLE:
case SYSTEM_AMD_MSR_CAPABLE:
rdmsr(cmd->addr.msr.reg, cmd->val, h);
break;
case SYSTEM_IO_CAPABLE:
acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
&cmd->val,
(u32)cmd->addr.io.bit_width);
break;
default:
break;
}
cmd->val = cmd->func.read(cmd->reg);
}
/* Called via smp_call_function_many(), on the target CPUs */
static void do_drv_write(void *_cmd)
static u32 drv_read(struct acpi_cpufreq_data *data, const struct cpumask *mask)
{
struct drv_cmd *cmd = _cmd;
u32 lo, hi;
struct acpi_processor_performance *perf = to_perf_data(data);
struct drv_cmd cmd = {
.reg = &perf->control_register,
.func.read = data->cpu_freq_read,
};
int err;
switch (cmd->type) {
case SYSTEM_INTEL_MSR_CAPABLE:
rdmsr(cmd->addr.msr.reg, lo, hi);
lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
wrmsr(cmd->addr.msr.reg, lo, hi);
break;
case SYSTEM_AMD_MSR_CAPABLE:
wrmsr(cmd->addr.msr.reg, cmd->val, 0);
break;
case SYSTEM_IO_CAPABLE:
acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
cmd->val,
(u32)cmd->addr.io.bit_width);
break;
default:
break;
}
err = smp_call_function_any(mask, do_drv_read, &cmd, 1);
WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
return cmd.val;
}
static void drv_read(struct drv_cmd *cmd)
/* Called via smp_call_function_many(), on the target CPUs */
static void do_drv_write(void *_cmd)
{
int err;
cmd->val = 0;
struct drv_cmd *cmd = _cmd;
err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
cmd->func.write(cmd->reg, cmd->val);
}
static void drv_write(struct drv_cmd *cmd)
static void drv_write(struct acpi_cpufreq_data *data,
const struct cpumask *mask, u32 val)
{
struct acpi_processor_performance *perf = to_perf_data(data);
struct drv_cmd cmd = {
.reg = &perf->control_register,
.val = val,
.func.write = data->cpu_freq_write,
};
int this_cpu;
this_cpu = get_cpu();
if (cpumask_test_cpu(this_cpu, cmd->mask))
do_drv_write(cmd);
smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
if (cpumask_test_cpu(this_cpu, mask))
do_drv_write(&cmd);
smp_call_function_many(mask, do_drv_write, &cmd, 1);
put_cpu();
}
static u32
get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data)
static u32 get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data)
{
struct acpi_processor_performance *perf;
struct drv_cmd cmd;
u32 val;
if (unlikely(cpumask_empty(mask)))
return 0;
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
break;
case SYSTEM_AMD_MSR_CAPABLE:
cmd.type = SYSTEM_AMD_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_AMD_PERF_CTL;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
perf = to_perf_data(data);
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
break;
default:
return 0;
}
cmd.mask = mask;
drv_read(&cmd);
val = drv_read(data, mask);
pr_debug("get_cur_val = %u\n", cmd.val);
pr_debug("get_cur_val = %u\n", val);
return cmd.val;
return val;
}
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
......@@ -416,7 +412,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
{
struct acpi_cpufreq_data *data = policy->driver_data;
struct acpi_processor_performance *perf;
struct drv_cmd cmd;
const struct cpumask *mask;
unsigned int next_perf_state = 0; /* Index into perf table */
int result = 0;
......@@ -434,42 +430,21 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
} else {
pr_debug("Already at target state (P%d)\n",
next_perf_state);
goto out;
}
return 0;
}
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
cmd.val = (u32) perf->states[next_perf_state].control;
break;
case SYSTEM_AMD_MSR_CAPABLE:
cmd.type = SYSTEM_AMD_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_AMD_PERF_CTL;
cmd.val = (u32) perf->states[next_perf_state].control;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
cmd.val = (u32) perf->states[next_perf_state].control;
break;
default:
result = -ENODEV;
goto out;
}
/* cpufreq holds the hotplug lock, so we are safe from here on */
if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
cmd.mask = policy->cpus;
else
cmd.mask = cpumask_of(policy->cpu);
/*
* The core won't allow CPUs to go away until the governor has been
* stopped, so we can rely on the stability of policy->cpus.
*/
mask = policy->shared_type == CPUFREQ_SHARED_TYPE_ANY ?
cpumask_of(policy->cpu) : policy->cpus;
drv_write(&cmd);
drv_write(data, mask, perf->states[next_perf_state].control);
if (acpi_pstate_strict) {
if (!check_freqs(cmd.mask, data->freq_table[index].frequency,
if (!check_freqs(mask, data->freq_table[index].frequency,
data)) {
pr_debug("acpi_cpufreq_target failed (%d)\n",
policy->cpu);
......@@ -480,7 +455,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
if (!result)
perf->state = next_perf_state;
out:
return result;
}
......@@ -740,15 +714,21 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
}
pr_debug("SYSTEM IO addr space\n");
data->cpu_feature = SYSTEM_IO_CAPABLE;
data->cpu_freq_read = cpu_freq_read_io;
data->cpu_freq_write = cpu_freq_write_io;
break;
case ACPI_ADR_SPACE_FIXED_HARDWARE:
pr_debug("HARDWARE addr space\n");
if (check_est_cpu(cpu)) {
data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
data->cpu_freq_read = cpu_freq_read_intel;
data->cpu_freq_write = cpu_freq_write_intel;
break;
}
if (check_amd_hwpstate_cpu(cpu)) {
data->cpu_feature = SYSTEM_AMD_MSR_CAPABLE;
data->cpu_freq_read = cpu_freq_read_amd;
data->cpu_freq_write = cpu_freq_write_amd;
break;
}
result = -ENODEV;
......
......@@ -21,7 +21,7 @@
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include "cpufreq_governor.h"
#include "cpufreq_ondemand.h"
#define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL 0xc0010080
#define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE 0xc0010081
......@@ -45,10 +45,10 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy,
long d_actual, d_reference;
struct msr actual, reference;
struct cpu_data_t *data = &per_cpu(cpu_data, policy->cpu);
struct dbs_data *od_data = policy->governor_data;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *od_data = policy_dbs->dbs_data;
struct od_dbs_tuners *od_tuners = od_data->tuners;
struct od_cpu_dbs_info_s *od_info =
od_data->cdata->get_cpu_dbs_info_s(policy->cpu);
struct od_policy_dbs_info *od_info = to_dbs_info(policy_dbs);
if (!od_info->freq_table)
return freq_next;
......
......@@ -31,9 +31,8 @@
struct private_data {
struct device *cpu_dev;
struct regulator *cpu_reg;
struct thermal_cooling_device *cdev;
unsigned int voltage_tolerance; /* in percentage */
const char *reg_name;
};
static struct freq_attr *cpufreq_dt_attr[] = {
......@@ -44,175 +43,128 @@ static struct freq_attr *cpufreq_dt_attr[] = {
static int set_target(struct cpufreq_policy *policy, unsigned int index)
{
struct dev_pm_opp *opp;
struct cpufreq_frequency_table *freq_table = policy->freq_table;
struct clk *cpu_clk = policy->clk;
struct private_data *priv = policy->driver_data;
struct device *cpu_dev = priv->cpu_dev;
struct regulator *cpu_reg = priv->cpu_reg;
unsigned long volt = 0, tol = 0;
int volt_old = 0;
unsigned int old_freq, new_freq;
long freq_Hz, freq_exact;
int ret;
freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
if (freq_Hz <= 0)
freq_Hz = freq_table[index].frequency * 1000;
freq_exact = freq_Hz;
new_freq = freq_Hz / 1000;
old_freq = clk_get_rate(cpu_clk) / 1000;
return dev_pm_opp_set_rate(priv->cpu_dev,
policy->freq_table[index].frequency * 1000);
}
if (!IS_ERR(cpu_reg)) {
unsigned long opp_freq;
/*
* An earlier version of opp-v1 bindings used to name the regulator
* "cpu0-supply", we still need to handle that for backwards compatibility.
*/
static const char *find_supply_name(struct device *dev)
{
struct device_node *np;
struct property *pp;
int cpu = dev->id;
const char *name = NULL;
rcu_read_lock();
opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
if (IS_ERR(opp)) {
rcu_read_unlock();
dev_err(cpu_dev, "failed to find OPP for %ld\n",
freq_Hz);
return PTR_ERR(opp);
}
volt = dev_pm_opp_get_voltage(opp);
opp_freq = dev_pm_opp_get_freq(opp);
rcu_read_unlock();
tol = volt * priv->voltage_tolerance / 100;
volt_old = regulator_get_voltage(cpu_reg);
dev_dbg(cpu_dev, "Found OPP: %ld kHz, %ld uV\n",
opp_freq / 1000, volt);
}
np = of_node_get(dev->of_node);
dev_dbg(cpu_dev, "%u MHz, %d mV --> %u MHz, %ld mV\n",
old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1,
new_freq / 1000, volt ? volt / 1000 : -1);
/* This must be valid for sure */
if (WARN_ON(!np))
return NULL;
/* scaling up? scale voltage before frequency */
if (!IS_ERR(cpu_reg) && new_freq > old_freq) {
ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
if (ret) {
dev_err(cpu_dev, "failed to scale voltage up: %d\n",
ret);
return ret;
/* Try "cpu0" for older DTs */
if (!cpu) {
pp = of_find_property(np, "cpu0-supply", NULL);
if (pp) {
name = "cpu0";
goto node_put;
}
}
ret = clk_set_rate(cpu_clk, freq_exact);
if (ret) {
dev_err(cpu_dev, "failed to set clock rate: %d\n", ret);
if (!IS_ERR(cpu_reg) && volt_old > 0)
regulator_set_voltage_tol(cpu_reg, volt_old, tol);
return ret;
pp = of_find_property(np, "cpu-supply", NULL);
if (pp) {
name = "cpu";
goto node_put;
}
/* scaling down? scale voltage after frequency */
if (!IS_ERR(cpu_reg) && new_freq < old_freq) {
ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
if (ret) {
dev_err(cpu_dev, "failed to scale voltage down: %d\n",
ret);
clk_set_rate(cpu_clk, old_freq * 1000);
}
}
return ret;
dev_dbg(dev, "no regulator for cpu%d\n", cpu);
node_put:
of_node_put(np);
return name;
}
static int allocate_resources(int cpu, struct device **cdev,
struct regulator **creg, struct clk **cclk)
static int resources_available(void)
{
struct device *cpu_dev;
struct regulator *cpu_reg;
struct clk *cpu_clk;
int ret = 0;
char *reg_cpu0 = "cpu0", *reg_cpu = "cpu", *reg;
const char *name;
cpu_dev = get_cpu_device(cpu);
cpu_dev = get_cpu_device(0);
if (!cpu_dev) {
pr_err("failed to get cpu%d device\n", cpu);
pr_err("failed to get cpu0 device\n");
return -ENODEV;
}
/* Try "cpu0" for older DTs */
if (!cpu)
reg = reg_cpu0;
else
reg = reg_cpu;
try_again:
cpu_reg = regulator_get_optional(cpu_dev, reg);
ret = PTR_ERR_OR_ZERO(cpu_reg);
cpu_clk = clk_get(cpu_dev, NULL);
ret = PTR_ERR_OR_ZERO(cpu_clk);
if (ret) {
/*
* If cpu's regulator supply node is present, but regulator is
* not yet registered, we should try defering probe.
* If cpu's clk node is present, but clock is not yet
* registered, we should try defering probe.
*/
if (ret == -EPROBE_DEFER) {
dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n",
cpu);
if (ret == -EPROBE_DEFER)
dev_dbg(cpu_dev, "clock not ready, retry\n");
else
dev_err(cpu_dev, "failed to get clock: %d\n", ret);
return ret;
}
/* Try with "cpu-supply" */
if (reg == reg_cpu0) {
reg = reg_cpu;
goto try_again;
}
clk_put(cpu_clk);
dev_dbg(cpu_dev, "no regulator for cpu%d: %d\n", cpu, ret);
}
name = find_supply_name(cpu_dev);
/* Platform doesn't require regulator */
if (!name)
return 0;
cpu_clk = clk_get(cpu_dev, NULL);
ret = PTR_ERR_OR_ZERO(cpu_clk);
cpu_reg = regulator_get_optional(cpu_dev, name);
ret = PTR_ERR_OR_ZERO(cpu_reg);
if (ret) {
/* put regulator */
if (!IS_ERR(cpu_reg))
regulator_put(cpu_reg);
/*
* If cpu's clk node is present, but clock is not yet
* registered, we should try defering probe.
* If cpu's regulator supply node is present, but regulator is
* not yet registered, we should try defering probe.
*/
if (ret == -EPROBE_DEFER)
dev_dbg(cpu_dev, "cpu%d clock not ready, retry\n", cpu);
dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n");
else
dev_err(cpu_dev, "failed to get cpu%d clock: %d\n", cpu,
ret);
} else {
*cdev = cpu_dev;
*creg = cpu_reg;
*cclk = cpu_clk;
}
dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret);
return ret;
}
regulator_put(cpu_reg);
return 0;
}
static int cpufreq_init(struct cpufreq_policy *policy)
{
struct cpufreq_frequency_table *freq_table;
struct device_node *np;
struct private_data *priv;
struct device *cpu_dev;
struct regulator *cpu_reg;
struct clk *cpu_clk;
struct dev_pm_opp *suspend_opp;
unsigned long min_uV = ~0, max_uV = 0;
unsigned int transition_latency;
bool need_update = false;
bool opp_v1 = false;
const char *name;
int ret;
ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk);
if (ret) {
pr_err("%s: Failed to allocate resources: %d\n", __func__, ret);
return ret;
cpu_dev = get_cpu_device(policy->cpu);
if (!cpu_dev) {
pr_err("failed to get cpu%d device\n", policy->cpu);
return -ENODEV;
}
np = of_node_get(cpu_dev->of_node);
if (!np) {
dev_err(cpu_dev, "failed to find cpu%d node\n", policy->cpu);
ret = -ENOENT;
goto out_put_reg_clk;
cpu_clk = clk_get(cpu_dev, NULL);
if (IS_ERR(cpu_clk)) {
ret = PTR_ERR(cpu_clk);
dev_err(cpu_dev, "%s: failed to get clk: %d\n", __func__, ret);
return ret;
}
/* Get OPP-sharing information from "operating-points-v2" bindings */
......@@ -223,9 +175,23 @@ static int cpufreq_init(struct cpufreq_policy *policy)
* finding shared-OPPs for backward compatibility.
*/
if (ret == -ENOENT)
need_update = true;
opp_v1 = true;
else
goto out_node_put;
goto out_put_clk;
}
/*
* OPP layer will be taking care of regulators now, but it needs to know
* the name of the regulator first.
*/
name = find_supply_name(cpu_dev);
if (name) {
ret = dev_pm_opp_set_regulator(cpu_dev, name);
if (ret) {
dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
policy->cpu, ret);
goto out_put_clk;
}
}
/*
......@@ -246,12 +212,12 @@ static int cpufreq_init(struct cpufreq_policy *policy)
*/
ret = dev_pm_opp_get_opp_count(cpu_dev);
if (ret <= 0) {
pr_debug("OPP table is not ready, deferring probe\n");
dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
ret = -EPROBE_DEFER;
goto out_free_opp;
}
if (need_update) {
if (opp_v1) {
struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
if (!pd || !pd->independent_clocks)
......@@ -265,10 +231,6 @@ static int cpufreq_init(struct cpufreq_policy *policy)
if (ret)
dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
__func__, ret);
of_property_read_u32(np, "clock-latency", &transition_latency);
} else {
transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev);
}
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
......@@ -277,62 +239,16 @@ static int cpufreq_init(struct cpufreq_policy *policy)
goto out_free_opp;
}
of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
if (!transition_latency)
transition_latency = CPUFREQ_ETERNAL;
if (!IS_ERR(cpu_reg)) {
unsigned long opp_freq = 0;
/*
* Disable any OPPs where the connected regulator isn't able to
* provide the specified voltage and record minimum and maximum
* voltage levels.
*/
while (1) {
struct dev_pm_opp *opp;
unsigned long opp_uV, tol_uV;
rcu_read_lock();
opp = dev_pm_opp_find_freq_ceil(cpu_dev, &opp_freq);
if (IS_ERR(opp)) {
rcu_read_unlock();
break;
}
opp_uV = dev_pm_opp_get_voltage(opp);
rcu_read_unlock();
tol_uV = opp_uV * priv->voltage_tolerance / 100;
if (regulator_is_supported_voltage(cpu_reg,
opp_uV - tol_uV,
opp_uV + tol_uV)) {
if (opp_uV < min_uV)
min_uV = opp_uV;
if (opp_uV > max_uV)
max_uV = opp_uV;
} else {
dev_pm_opp_disable(cpu_dev, opp_freq);
}
opp_freq++;
}
ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV);
if (ret > 0)
transition_latency += ret * 1000;
}
priv->reg_name = name;
ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
if (ret) {
pr_err("failed to init cpufreq table: %d\n", ret);
dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
goto out_free_priv;
}
priv->cpu_dev = cpu_dev;
priv->cpu_reg = cpu_reg;
policy->driver_data = priv;
policy->clk = cpu_clk;
rcu_read_lock();
......@@ -357,9 +273,11 @@ static int cpufreq_init(struct cpufreq_policy *policy)
cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;
}
policy->cpuinfo.transition_latency = transition_latency;
transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev);
if (!transition_latency)
transition_latency = CPUFREQ_ETERNAL;
of_node_put(np);
policy->cpuinfo.transition_latency = transition_latency;
return 0;
......@@ -369,12 +287,10 @@ static int cpufreq_init(struct cpufreq_policy *policy)
kfree(priv);
out_free_opp:
dev_pm_opp_of_cpumask_remove_table(policy->cpus);
out_node_put:
of_node_put(np);
out_put_reg_clk:
if (name)
dev_pm_opp_put_regulator(cpu_dev);
out_put_clk:
clk_put(cpu_clk);
if (!IS_ERR(cpu_reg))
regulator_put(cpu_reg);
return ret;
}
......@@ -386,9 +302,10 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
cpufreq_cooling_unregister(priv->cdev);
dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
if (priv->reg_name)
dev_pm_opp_put_regulator(priv->cpu_dev);
clk_put(policy->clk);
if (!IS_ERR(priv->cpu_reg))
regulator_put(priv->cpu_reg);
kfree(priv);
return 0;
......@@ -441,9 +358,6 @@ static struct cpufreq_driver dt_cpufreq_driver = {
static int dt_cpufreq_probe(struct platform_device *pdev)
{
struct device *cpu_dev;
struct regulator *cpu_reg;
struct clk *cpu_clk;
int ret;
/*
......@@ -453,19 +367,15 @@ static int dt_cpufreq_probe(struct platform_device *pdev)
*
* FIXME: Is checking this only for CPU0 sufficient ?
*/
ret = allocate_resources(0, &cpu_dev, &cpu_reg, &cpu_clk);
ret = resources_available();
if (ret)
return ret;
clk_put(cpu_clk);
if (!IS_ERR(cpu_reg))
regulator_put(cpu_reg);
dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev);
ret = cpufreq_register_driver(&dt_cpufreq_driver);
if (ret)
dev_err(cpu_dev, "failed register driver: %d\n", ret);
dev_err(&pdev->dev, "failed register driver: %d\n", ret);
return ret;
}
......
......@@ -38,48 +38,10 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy)
return cpumask_empty(policy->cpus);
}
static bool suitable_policy(struct cpufreq_policy *policy, bool active)
{
return active == !policy_is_inactive(policy);
}
/* Finds Next Acive/Inactive policy */
static struct cpufreq_policy *next_policy(struct cpufreq_policy *policy,
bool active)
{
do {
/* No more policies in the list */
if (list_is_last(&policy->policy_list, &cpufreq_policy_list))
return NULL;
policy = list_next_entry(policy, policy_list);
} while (!suitable_policy(policy, active));
return policy;
}
static struct cpufreq_policy *first_policy(bool active)
{
struct cpufreq_policy *policy;
/* No policies in the list */
if (list_empty(&cpufreq_policy_list))
return NULL;
policy = list_first_entry(&cpufreq_policy_list, typeof(*policy),
policy_list);
if (!suitable_policy(policy, active))
policy = next_policy(policy, active);
return policy;
}
/* Macros to iterate over CPU policies */
#define for_each_suitable_policy(__policy, __active) \
for (__policy = first_policy(__active); \
__policy; \
__policy = next_policy(__policy, __active))
list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
if ((__active) == !policy_is_inactive(__policy))
#define for_each_active_policy(__policy) \
for_each_suitable_policy(__policy, true)
......@@ -102,7 +64,6 @@ static LIST_HEAD(cpufreq_governor_list);
static struct cpufreq_driver *cpufreq_driver;
static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
static DEFINE_RWLOCK(cpufreq_driver_lock);
DEFINE_MUTEX(cpufreq_governor_lock);
/* Flag to suspend/resume CPUFreq governors */
static bool cpufreq_suspended;
......@@ -113,10 +74,8 @@ static inline bool has_target(void)
}
/* internal prototypes */
static int __cpufreq_governor(struct cpufreq_policy *policy,
unsigned int event);
static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
static void handle_update(struct work_struct *work);
/**
* Two notifier lists: the "policy" list is involved in the
......@@ -818,12 +777,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
ssize_t ret;
down_read(&policy->rwsem);
if (fattr->show)
ret = fattr->show(policy, buf);
else
ret = -EIO;
up_read(&policy->rwsem);
return ret;
......@@ -838,18 +792,12 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
get_online_cpus();
if (!cpu_online(policy->cpu))
goto unlock;
if (cpu_online(policy->cpu)) {
down_write(&policy->rwsem);
if (fattr->store)
ret = fattr->store(policy, buf, count);
else
ret = -EIO;
up_write(&policy->rwsem);
unlock:
}
put_online_cpus();
return ret;
......@@ -959,6 +907,11 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
return cpufreq_add_dev_symlink(policy);
}
__weak struct cpufreq_governor *cpufreq_default_governor(void)
{
return NULL;
}
static int cpufreq_init_policy(struct cpufreq_policy *policy)
{
struct cpufreq_governor *gov = NULL;
......@@ -968,11 +921,14 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy)
/* Update governor of new_policy to the governor used before hotplug */
gov = find_governor(policy->last_governor);
if (gov)
if (gov) {
pr_debug("Restoring governor %s for cpu %d\n",
policy->governor->name, policy->cpu);
else
gov = CPUFREQ_DEFAULT_GOVERNOR;
} else {
gov = cpufreq_default_governor();
if (!gov)
return -ENODATA;
}
new_policy.governor = gov;
......@@ -996,36 +952,45 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
if (cpumask_test_cpu(cpu, policy->cpus))
return 0;
down_write(&policy->rwsem);
if (has_target()) {
ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
if (ret) {
pr_err("%s: Failed to stop governor\n", __func__);
return ret;
goto unlock;
}
}
down_write(&policy->rwsem);
cpumask_set_cpu(cpu, policy->cpus);
up_write(&policy->rwsem);
if (has_target()) {
ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
if (ret) {
if (ret)
pr_err("%s: Failed to start governor\n", __func__);
return ret;
}
}
return 0;
unlock:
up_write(&policy->rwsem);
return ret;
}
static void handle_update(struct work_struct *work)
{
struct cpufreq_policy *policy =
container_of(work, struct cpufreq_policy, update);
unsigned int cpu = policy->cpu;
pr_debug("handle_update for cpu %u called\n", cpu);
cpufreq_update_policy(cpu);
}
static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
struct cpufreq_policy *policy;
int ret;
if (WARN_ON(!dev))
return NULL;
......@@ -1043,7 +1008,13 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL))
goto err_free_rcpumask;
kobject_init(&policy->kobj, &ktype_cpufreq);
ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
cpufreq_global_kobject, "policy%u", cpu);
if (ret) {
pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
goto err_free_real_cpus;
}
INIT_LIST_HEAD(&policy->policy_list);
init_rwsem(&policy->rwsem);
spin_lock_init(&policy->transition_lock);
......@@ -1054,6 +1025,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
policy->cpu = cpu;
return policy;
err_free_real_cpus:
free_cpumask_var(policy->real_cpus);
err_free_rcpumask:
free_cpumask_var(policy->related_cpus);
err_free_cpumask:
......@@ -1158,16 +1131,6 @@ static int cpufreq_online(unsigned int cpu)
cpumask_copy(policy->related_cpus, policy->cpus);
/* Remember CPUs present at the policy creation time. */
cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);
/* Name and add the kobject */
ret = kobject_add(&policy->kobj, cpufreq_global_kobject,
"policy%u",
cpumask_first(policy->related_cpus));
if (ret) {
pr_err("%s: failed to add policy->kobj: %d\n", __func__,
ret);
goto out_exit_policy;
}
}
/*
......@@ -1309,9 +1272,10 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
return ret;
}
static void cpufreq_offline_prepare(unsigned int cpu)
static void cpufreq_offline(unsigned int cpu)
{
struct cpufreq_policy *policy;
int ret;
pr_debug("%s: unregistering CPU %u\n", __func__, cpu);
......@@ -1321,13 +1285,13 @@ static void cpufreq_offline_prepare(unsigned int cpu)
return;
}
down_write(&policy->rwsem);
if (has_target()) {
int ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
if (ret)
pr_err("%s: Failed to stop governor\n", __func__);
}
down_write(&policy->rwsem);
cpumask_clear_cpu(cpu, policy->cpus);
if (policy_is_inactive(policy)) {
......@@ -1340,39 +1304,27 @@ static void cpufreq_offline_prepare(unsigned int cpu)
/* Nominate new CPU */
policy->cpu = cpumask_any(policy->cpus);
}
up_write(&policy->rwsem);
/* Start governor again for active policy */
if (!policy_is_inactive(policy)) {
if (has_target()) {
int ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
if (ret)
pr_err("%s: Failed to start governor\n", __func__);
}
} else if (cpufreq_driver->stop_cpu) {
cpufreq_driver->stop_cpu(policy);
}
}
static void cpufreq_offline_finish(unsigned int cpu)
{
struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
if (!policy) {
pr_debug("%s: No cpu_data found\n", __func__);
return;
goto unlock;
}
/* Only proceed for inactive policies */
if (!policy_is_inactive(policy))
return;
if (cpufreq_driver->stop_cpu)
cpufreq_driver->stop_cpu(policy);
/* If cpu is last user of policy, free policy */
if (has_target()) {
int ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
if (ret)
pr_err("%s: Failed to exit governor\n", __func__);
}
......@@ -1386,6 +1338,9 @@ static void cpufreq_offline_finish(unsigned int cpu)
cpufreq_driver->exit(policy);
policy->freq_table = NULL;
}
unlock:
up_write(&policy->rwsem);
}
/**
......@@ -1401,10 +1356,8 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
if (!policy)
return;
if (cpu_online(cpu)) {
cpufreq_offline_prepare(cpu);
cpufreq_offline_finish(cpu);
}
if (cpu_online(cpu))
cpufreq_offline(cpu);
cpumask_clear_cpu(cpu, policy->real_cpus);
remove_cpu_dev_symlink(policy, cpu);
......@@ -1413,15 +1366,6 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
cpufreq_policy_free(policy, true);
}
static void handle_update(struct work_struct *work)
{
struct cpufreq_policy *policy =
container_of(work, struct cpufreq_policy, update);
unsigned int cpu = policy->cpu;
pr_debug("handle_update for cpu %u called\n", cpu);
cpufreq_update_policy(cpu);
}
/**
* cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're
* in deep trouble.
......@@ -1584,6 +1528,7 @@ EXPORT_SYMBOL(cpufreq_generic_suspend);
void cpufreq_suspend(void)
{
struct cpufreq_policy *policy;
int ret;
if (!cpufreq_driver)
return;
......@@ -1594,7 +1539,11 @@ void cpufreq_suspend(void)
pr_debug("%s: Suspending Governors\n", __func__);
for_each_active_policy(policy) {
if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP))
down_write(&policy->rwsem);
ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
up_write(&policy->rwsem);
if (ret)
pr_err("%s: Failed to stop governor for policy: %p\n",
__func__, policy);
else if (cpufreq_driver->suspend
......@@ -1616,6 +1565,7 @@ void cpufreq_suspend(void)
void cpufreq_resume(void)
{
struct cpufreq_policy *policy;
int ret;
if (!cpufreq_driver)
return;
......@@ -1628,14 +1578,21 @@ void cpufreq_resume(void)
pr_debug("%s: Resuming Governors\n", __func__);
for_each_active_policy(policy) {
if (cpufreq_driver->resume && cpufreq_driver->resume(policy))
if (cpufreq_driver->resume && cpufreq_driver->resume(policy)) {
pr_err("%s: Failed to resume driver: %p\n", __func__,
policy);
else if (__cpufreq_governor(policy, CPUFREQ_GOV_START)
|| __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS))
} else {
down_write(&policy->rwsem);
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
up_write(&policy->rwsem);
if (ret)
pr_err("%s: Failed to start governor for policy: %p\n",
__func__, policy);
}
}
/*
* schedule call cpufreq_update_policy() for first-online CPU, as that
......@@ -1846,7 +1803,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
unsigned int relation)
{
unsigned int old_target_freq = target_freq;
int retval = -EINVAL;
struct cpufreq_frequency_table *freq_table;
int index, retval;
if (cpufreq_disabled())
return -ENODEV;
......@@ -1873,34 +1831,28 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
policy->restore_freq = policy->cur;
if (cpufreq_driver->target)
retval = cpufreq_driver->target(policy, target_freq, relation);
else if (cpufreq_driver->target_index) {
struct cpufreq_frequency_table *freq_table;
int index;
return cpufreq_driver->target(policy, target_freq, relation);
if (!cpufreq_driver->target_index)
return -EINVAL;
freq_table = cpufreq_frequency_get_table(policy->cpu);
if (unlikely(!freq_table)) {
pr_err("%s: Unable to find freq_table\n", __func__);
goto out;
return -EINVAL;
}
retval = cpufreq_frequency_table_target(policy, freq_table,
target_freq, relation, &index);
retval = cpufreq_frequency_table_target(policy, freq_table, target_freq,
relation, &index);
if (unlikely(retval)) {
pr_err("%s: Unable to find matching freq\n", __func__);
goto out;
}
if (freq_table[index].frequency == policy->cur) {
retval = 0;
goto out;
return retval;
}
retval = __target_index(policy, freq_table, index);
}
if (freq_table[index].frequency == policy->cur)
return 0;
out:
return retval;
return __target_index(policy, freq_table, index);
}
EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
......@@ -1920,20 +1872,14 @@ int cpufreq_driver_target(struct cpufreq_policy *policy,
}
EXPORT_SYMBOL_GPL(cpufreq_driver_target);
static int __cpufreq_governor(struct cpufreq_policy *policy,
unsigned int event)
__weak struct cpufreq_governor *cpufreq_fallback_governor(void)
{
int ret;
return NULL;
}
/* Only must be defined when default governor is known to have latency
restrictions, like e.g. conservative or ondemand.
That this is the case is already ensured in Kconfig
*/
#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
struct cpufreq_governor *gov = &cpufreq_gov_performance;
#else
struct cpufreq_governor *gov = NULL;
#endif
static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
{
int ret;
/* Don't start any governor operations if we are entering suspend */
if (cpufreq_suspended)
......@@ -1948,12 +1894,14 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
if (policy->governor->max_transition_latency &&
policy->cpuinfo.transition_latency >
policy->governor->max_transition_latency) {
if (!gov)
return -EINVAL;
else {
struct cpufreq_governor *gov = cpufreq_fallback_governor();
if (gov) {
pr_warn("%s governor failed, too long transition latency of HW, fallback to %s governor\n",
policy->governor->name, gov->name);
policy->governor = gov;
} else {
return -EINVAL;
}
}
......@@ -1963,21 +1911,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event);
mutex_lock(&cpufreq_governor_lock);
if ((policy->governor_enabled && event == CPUFREQ_GOV_START)
|| (!policy->governor_enabled
&& (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) {
mutex_unlock(&cpufreq_governor_lock);
return -EBUSY;
}
if (event == CPUFREQ_GOV_STOP)
policy->governor_enabled = false;
else if (event == CPUFREQ_GOV_START)
policy->governor_enabled = true;
mutex_unlock(&cpufreq_governor_lock);
ret = policy->governor->governor(policy, event);
if (!ret) {
......@@ -1985,14 +1918,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
policy->governor->initialized++;
else if (event == CPUFREQ_GOV_POLICY_EXIT)
policy->governor->initialized--;
} else {
/* Restore original values */
mutex_lock(&cpufreq_governor_lock);
if (event == CPUFREQ_GOV_STOP)
policy->governor_enabled = true;
else if (event == CPUFREQ_GOV_START)
policy->governor_enabled = false;
mutex_unlock(&cpufreq_governor_lock);
}
if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) ||
......@@ -2147,7 +2072,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
old_gov = policy->governor;
/* end old governor */
if (old_gov) {
ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
if (ret) {
/* This can happen due to race with other operations */
pr_debug("%s: Failed to Stop Governor: %s (%d)\n",
......@@ -2155,10 +2080,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
return ret;
}
up_write(&policy->rwsem);
ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
down_write(&policy->rwsem);
ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
if (ret) {
pr_err("%s: Failed to Exit Governor: %s (%d)\n",
__func__, old_gov->name, ret);
......@@ -2168,32 +2090,30 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
/* start new governor */
policy->governor = new_policy->governor;
ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
if (!ret) {
ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
goto out;
up_write(&policy->rwsem);
__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
down_write(&policy->rwsem);
cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
}
/* new governor failed, so re-start old one */
pr_debug("starting governor %s failed\n", policy->governor->name);
if (old_gov) {
policy->governor = old_gov;
if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
policy->governor = NULL;
else
__cpufreq_governor(policy, CPUFREQ_GOV_START);
cpufreq_governor(policy, CPUFREQ_GOV_START);
}
return ret;
out:
pr_debug("governor: change or update limits\n");
return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}
/**
......@@ -2260,11 +2180,7 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
break;
case CPU_DOWN_PREPARE:
cpufreq_offline_prepare(cpu);
break;
case CPU_POST_DEAD:
cpufreq_offline_finish(cpu);
cpufreq_offline(cpu);
break;
case CPU_DOWN_FAILED:
......@@ -2297,8 +2213,11 @@ static int cpufreq_boost_set_sw(int state)
__func__);
break;
}
down_write(&policy->rwsem);
policy->user_policy.max = policy->max;
__cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
up_write(&policy->rwsem);
}
}
......@@ -2384,7 +2303,7 @@ EXPORT_SYMBOL_GPL(cpufreq_boost_enabled);
* submitted by the CPU Frequency driver.
*
* Registers a CPU Frequency driver to this core code. This code
* returns zero on success, -EBUSY when another driver got here first
* returns zero on success, -EEXIST when another driver got here first
* (and isn't unregistered in the meantime).
*
*/
......
......@@ -14,6 +14,22 @@
#include <linux/slab.h>
#include "cpufreq_governor.h"
struct cs_policy_dbs_info {
struct policy_dbs_info policy_dbs;
unsigned int down_skip;
unsigned int requested_freq;
};
static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
{
return container_of(policy_dbs, struct cs_policy_dbs_info, policy_dbs);
}
struct cs_dbs_tuners {
unsigned int down_threshold;
unsigned int freq_step;
};
/* Conservative governor macros */
#define DEF_FREQUENCY_UP_THRESHOLD (80)
#define DEF_FREQUENCY_DOWN_THRESHOLD (20)
......@@ -21,21 +37,6 @@
#define DEF_SAMPLING_DOWN_FACTOR (1)
#define MAX_SAMPLING_DOWN_FACTOR (10)
static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info);
static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy,
unsigned int event);
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
static
#endif
struct cpufreq_governor cpufreq_gov_conservative = {
.name = "conservative",
.governor = cs_cpufreq_governor_dbs,
.max_transition_latency = TRANSITION_LATENCY_LIMIT,
.owner = THIS_MODULE,
};
static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
struct cpufreq_policy *policy)
{
......@@ -57,27 +58,28 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
* Any frequency increase takes it to the maximum frequency. Frequency reduction
* happens at minimum steps of 5% (default) of maximum frequency
*/
static void cs_check_cpu(int cpu, unsigned int load)
static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
{
struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
struct cpufreq_policy *policy = dbs_info->cdbs.shared->policy;
struct dbs_data *dbs_data = policy->governor_data;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
struct dbs_data *dbs_data = policy_dbs->dbs_data;
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
unsigned int load = dbs_update(policy);
/*
* break out if we 'cannot' reduce the speed as the user might
* want freq_step to be zero
*/
if (cs_tuners->freq_step == 0)
return;
goto out;
/* Check for frequency increase */
if (load > cs_tuners->up_threshold) {
if (load > dbs_data->up_threshold) {
dbs_info->down_skip = 0;
/* if we are already at full speed then break out early */
if (dbs_info->requested_freq == policy->max)
return;
goto out;
dbs_info->requested_freq += get_freq_target(cs_tuners, policy);
......@@ -86,12 +88,12 @@ static void cs_check_cpu(int cpu, unsigned int load)
__cpufreq_driver_target(policy, dbs_info->requested_freq,
CPUFREQ_RELATION_H);
return;
goto out;
}
/* if sampling_down_factor is active break out early */
if (++dbs_info->down_skip < cs_tuners->sampling_down_factor)
return;
if (++dbs_info->down_skip < dbs_data->sampling_down_factor)
goto out;
dbs_info->down_skip = 0;
/* Check for frequency decrease */
......@@ -101,7 +103,7 @@ static void cs_check_cpu(int cpu, unsigned int load)
* if we cannot reduce the frequency anymore, break out early
*/
if (policy->cur == policy->min)
return;
goto out;
freq_target = get_freq_target(cs_tuners, policy);
if (dbs_info->requested_freq > freq_target)
......@@ -111,58 +113,25 @@ static void cs_check_cpu(int cpu, unsigned int load)
__cpufreq_driver_target(policy, dbs_info->requested_freq,
CPUFREQ_RELATION_L);
return;
}
}
static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
{
struct dbs_data *dbs_data = policy->governor_data;
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
if (modify_all)
dbs_check_cpu(dbs_data, policy->cpu);
return delay_for_sampling_rate(cs_tuners->sampling_rate);
out:
return dbs_data->sampling_rate;
}
static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
struct cs_cpu_dbs_info_s *dbs_info =
&per_cpu(cs_cpu_dbs_info, freq->cpu);
struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu);
if (!policy)
return 0;
/* policy isn't governed by conservative governor */
if (policy->governor != &cpufreq_gov_conservative)
return 0;
/*
* we only care if our internally tracked freq moves outside the 'valid'
* ranges of frequency available to us otherwise we do not change it
*/
if (dbs_info->requested_freq > policy->max
|| dbs_info->requested_freq < policy->min)
dbs_info->requested_freq = freq->new;
return 0;
}
void *data);
static struct notifier_block cs_cpufreq_notifier_block = {
.notifier_call = dbs_cpufreq_notifier,
};
/************************** sysfs interface ************************/
static struct common_dbs_data cs_dbs_cdata;
static struct dbs_governor cs_dbs_gov;
static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
const char *buf, size_t count)
{
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
......@@ -170,22 +139,7 @@ static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
return -EINVAL;
cs_tuners->sampling_down_factor = input;
return count;
}
static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
size_t count)
{
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
cs_tuners->sampling_rate = max(input, dbs_data->min_sampling_rate);
dbs_data->sampling_down_factor = input;
return count;
}
......@@ -200,7 +154,7 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
if (ret != 1 || input > 100 || input <= cs_tuners->down_threshold)
return -EINVAL;
cs_tuners->up_threshold = input;
dbs_data->up_threshold = input;
return count;
}
......@@ -214,7 +168,7 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf,
/* cannot be lower than 11 otherwise freq will not fall */
if (ret != 1 || input < 11 || input > 100 ||
input >= cs_tuners->up_threshold)
input >= dbs_data->up_threshold)
return -EINVAL;
cs_tuners->down_threshold = input;
......@@ -224,8 +178,7 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf,
static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
const char *buf, size_t count)
{
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
unsigned int input, j;
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
......@@ -235,21 +188,14 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
if (input > 1)
input = 1;
if (input == cs_tuners->ignore_nice_load) /* nothing to do */
if (input == dbs_data->ignore_nice_load) /* nothing to do */
return count;
cs_tuners->ignore_nice_load = input;
dbs_data->ignore_nice_load = input;
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct cs_cpu_dbs_info_s *dbs_info;
dbs_info = &per_cpu(cs_cpu_dbs_info, j);
dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->cdbs.prev_cpu_wall, 0);
if (cs_tuners->ignore_nice_load)
dbs_info->cdbs.prev_cpu_nice =
kcpustat_cpu(j).cpustat[CPUTIME_NICE];
}
gov_update_cpu_data(dbs_data);
return count;
}
......@@ -275,55 +221,47 @@ static ssize_t store_freq_step(struct dbs_data *dbs_data, const char *buf,
return count;
}
show_store_one(cs, sampling_rate);
show_store_one(cs, sampling_down_factor);
show_store_one(cs, up_threshold);
show_store_one(cs, down_threshold);
show_store_one(cs, ignore_nice_load);
show_store_one(cs, freq_step);
declare_show_sampling_rate_min(cs);
gov_sys_pol_attr_rw(sampling_rate);
gov_sys_pol_attr_rw(sampling_down_factor);
gov_sys_pol_attr_rw(up_threshold);
gov_sys_pol_attr_rw(down_threshold);
gov_sys_pol_attr_rw(ignore_nice_load);
gov_sys_pol_attr_rw(freq_step);
gov_sys_pol_attr_ro(sampling_rate_min);
static struct attribute *dbs_attributes_gov_sys[] = {
&sampling_rate_min_gov_sys.attr,
&sampling_rate_gov_sys.attr,
&sampling_down_factor_gov_sys.attr,
&up_threshold_gov_sys.attr,
&down_threshold_gov_sys.attr,
&ignore_nice_load_gov_sys.attr,
&freq_step_gov_sys.attr,
gov_show_one_common(sampling_rate);
gov_show_one_common(sampling_down_factor);
gov_show_one_common(up_threshold);
gov_show_one_common(ignore_nice_load);
gov_show_one_common(min_sampling_rate);
gov_show_one(cs, down_threshold);
gov_show_one(cs, freq_step);
gov_attr_rw(sampling_rate);
gov_attr_rw(sampling_down_factor);
gov_attr_rw(up_threshold);
gov_attr_rw(ignore_nice_load);
gov_attr_ro(min_sampling_rate);
gov_attr_rw(down_threshold);
gov_attr_rw(freq_step);
static struct attribute *cs_attributes[] = {
&min_sampling_rate.attr,
&sampling_rate.attr,
&sampling_down_factor.attr,
&up_threshold.attr,
&down_threshold.attr,
&ignore_nice_load.attr,
&freq_step.attr,
NULL
};
static struct attribute_group cs_attr_group_gov_sys = {
.attrs = dbs_attributes_gov_sys,
.name = "conservative",
};
/************************** sysfs end ************************/
static struct attribute *dbs_attributes_gov_pol[] = {
&sampling_rate_min_gov_pol.attr,
&sampling_rate_gov_pol.attr,
&sampling_down_factor_gov_pol.attr,
&up_threshold_gov_pol.attr,
&down_threshold_gov_pol.attr,
&ignore_nice_load_gov_pol.attr,
&freq_step_gov_pol.attr,
NULL
};
static struct policy_dbs_info *cs_alloc(void)
{
struct cs_policy_dbs_info *dbs_info;
static struct attribute_group cs_attr_group_gov_pol = {
.attrs = dbs_attributes_gov_pol,
.name = "conservative",
};
dbs_info = kzalloc(sizeof(*dbs_info), GFP_KERNEL);
return dbs_info ? &dbs_info->policy_dbs : NULL;
}
/************************** sysfs end ************************/
static void cs_free(struct policy_dbs_info *policy_dbs)
{
kfree(to_dbs_info(policy_dbs));
}
static int cs_init(struct dbs_data *dbs_data, bool notify)
{
......@@ -335,11 +273,11 @@ static int cs_init(struct dbs_data *dbs_data, bool notify)
return -ENOMEM;
}
tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
tuners->down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD;
tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
tuners->ignore_nice_load = 0;
tuners->freq_step = DEF_FREQUENCY_STEP;
dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
dbs_data->ignore_nice_load = 0;
dbs_data->tuners = tuners;
dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
......@@ -361,35 +299,66 @@ static void cs_exit(struct dbs_data *dbs_data, bool notify)
kfree(dbs_data->tuners);
}
define_get_cpu_dbs_routines(cs_cpu_dbs_info);
static void cs_start(struct cpufreq_policy *policy)
{
struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
dbs_info->down_skip = 0;
dbs_info->requested_freq = policy->cur;
}
static struct common_dbs_data cs_dbs_cdata = {
.governor = GOV_CONSERVATIVE,
.attr_group_gov_sys = &cs_attr_group_gov_sys,
.attr_group_gov_pol = &cs_attr_group_gov_pol,
.get_cpu_cdbs = get_cpu_cdbs,
.get_cpu_dbs_info_s = get_cpu_dbs_info_s,
static struct dbs_governor cs_dbs_gov = {
.gov = {
.name = "conservative",
.governor = cpufreq_governor_dbs,
.max_transition_latency = TRANSITION_LATENCY_LIMIT,
.owner = THIS_MODULE,
},
.kobj_type = { .default_attrs = cs_attributes },
.gov_dbs_timer = cs_dbs_timer,
.gov_check_cpu = cs_check_cpu,
.alloc = cs_alloc,
.free = cs_free,
.init = cs_init,
.exit = cs_exit,
.mutex = __MUTEX_INITIALIZER(cs_dbs_cdata.mutex),
.start = cs_start,
};
static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy,
unsigned int event)
#define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_gov.gov)
static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
return cpufreq_governor_dbs(policy, &cs_dbs_cdata, event);
struct cpufreq_freqs *freq = data;
struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu);
struct cs_policy_dbs_info *dbs_info;
if (!policy)
return 0;
/* policy isn't governed by conservative governor */
if (policy->governor != CPU_FREQ_GOV_CONSERVATIVE)
return 0;
dbs_info = to_dbs_info(policy->governor_data);
/*
* we only care if our internally tracked freq moves outside the 'valid'
* ranges of frequency available to us otherwise we do not change it
*/
if (dbs_info->requested_freq > policy->max
|| dbs_info->requested_freq < policy->min)
dbs_info->requested_freq = freq->new;
return 0;
}
static int __init cpufreq_gov_dbs_init(void)
{
return cpufreq_register_governor(&cpufreq_gov_conservative);
return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE);
}
static void __exit cpufreq_gov_dbs_exit(void)
{
cpufreq_unregister_governor(&cpufreq_gov_conservative);
cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE);
}
MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>");
......@@ -399,6 +368,11 @@ MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for "
MODULE_LICENSE("GPL");
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return CPU_FREQ_GOV_CONSERVATIVE;
}
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
......
......@@ -18,95 +18,193 @@
#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include "cpufreq_governor.h"
static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data)
static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
static DEFINE_MUTEX(gov_dbs_data_mutex);
/* Common sysfs tunables */
/**
* store_sampling_rate - update sampling rate effective immediately if needed.
*
* If new rate is smaller than the old, simply updating
* dbs.sampling_rate might not be appropriate. For example, if the
* original sampling_rate was 1 second and the requested new sampling rate is 10
* ms because the user needs immediate reaction from ondemand governor, but not
* sure if higher frequency will be required or not, then, the governor may
* change the sampling rate too late; up to 1 second later. Thus, if we are
* reducing the sampling rate, we need to make the new value effective
* immediately.
*
* This must be called with dbs_data->mutex held, otherwise traversing
* policy_dbs_list isn't safe.
*/
ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
size_t count)
{
if (have_governor_per_policy())
return dbs_data->cdata->attr_group_gov_pol;
else
return dbs_data->cdata->attr_group_gov_sys;
struct policy_dbs_info *policy_dbs;
unsigned int rate;
int ret;
ret = sscanf(buf, "%u", &rate);
if (ret != 1)
return -EINVAL;
dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate);
/*
* We are operating under dbs_data->mutex and so the list and its
* entries can't be freed concurrently.
*/
list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
mutex_lock(&policy_dbs->timer_mutex);
/*
* On 32-bit architectures this may race with the
* sample_delay_ns read in dbs_update_util_handler(), but that
* really doesn't matter. If the read returns a value that's
* too big, the sample will be skipped, but the next invocation
* of dbs_update_util_handler() (when the update has been
* completed) will take a sample.
*
* If this runs in parallel with dbs_work_handler(), we may end
* up overwriting the sample_delay_ns value that it has just
* written, but it will be corrected next time a sample is
* taken, so it shouldn't be significant.
*/
gov_update_sample_delay(policy_dbs, 0);
mutex_unlock(&policy_dbs->timer_mutex);
}
return count;
}
EXPORT_SYMBOL_GPL(store_sampling_rate);
void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
/**
* gov_update_cpu_data - Update CPU load data.
* @dbs_data: Top-level governor data pointer.
*
* Update CPU load data for all CPUs in the domain governed by @dbs_data
* (that may be a single policy or a bunch of them if governor tunables are
* system-wide).
*
* Call under the @dbs_data mutex.
*/
void gov_update_cpu_data(struct dbs_data *dbs_data)
{
struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
struct cpufreq_policy *policy = cdbs->shared->policy;
unsigned int sampling_rate;
unsigned int max_load = 0;
unsigned int ignore_nice;
struct policy_dbs_info *policy_dbs;
list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
unsigned int j;
if (dbs_data->cdata->governor == GOV_ONDEMAND) {
struct od_cpu_dbs_info_s *od_dbs_info =
dbs_data->cdata->get_cpu_dbs_info_s(cpu);
for_each_cpu(j, policy_dbs->policy->cpus) {
struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
/*
* Sometimes, the ondemand governor uses an additional
* multiplier to give long delays. So apply this multiplier to
* the 'sampling_rate', so as to keep the wake-up-from-idle
* detection logic a bit conservative.
j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall,
dbs_data->io_is_busy);
if (dbs_data->ignore_nice_load)
j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
}
}
}
EXPORT_SYMBOL_GPL(gov_update_cpu_data);
static inline struct dbs_data *to_dbs_data(struct kobject *kobj)
{
return container_of(kobj, struct dbs_data, kobj);
}
static inline struct governor_attr *to_gov_attr(struct attribute *attr)
{
return container_of(attr, struct governor_attr, attr);
}
static ssize_t governor_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct dbs_data *dbs_data = to_dbs_data(kobj);
struct governor_attr *gattr = to_gov_attr(attr);
return gattr->show(dbs_data, buf);
}
static ssize_t governor_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(kobj);
struct governor_attr *gattr = to_gov_attr(attr);
int ret = -EBUSY;
mutex_lock(&dbs_data->mutex);
if (dbs_data->usage_count)
ret = gattr->store(dbs_data, buf, count);
mutex_unlock(&dbs_data->mutex);
return ret;
}
/*
* Sysfs Ops for accessing governor attributes.
*
* All show/store invocations for governor specific sysfs attributes, will first
* call the below show/store callbacks and the attribute specific callback will
* be called from within it.
*/
sampling_rate = od_tuners->sampling_rate;
sampling_rate *= od_dbs_info->rate_mult;
static const struct sysfs_ops governor_sysfs_ops = {
.show = governor_show,
.store = governor_store,
};
ignore_nice = od_tuners->ignore_nice_load;
} else {
sampling_rate = cs_tuners->sampling_rate;
ignore_nice = cs_tuners->ignore_nice_load;
}
unsigned int dbs_update(struct cpufreq_policy *policy)
{
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
unsigned int ignore_nice = dbs_data->ignore_nice_load;
unsigned int max_load = 0;
unsigned int sampling_rate, io_busy, j;
/*
* Sometimes governors may use an additional multiplier to increase
* sample delays temporarily. Apply that multiplier to sampling_rate
* so as to keep the wake-up-from-idle detection logic a bit
* conservative.
*/
sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult;
/*
* For the purpose of ondemand, waiting for disk IO is an indication
* that you're performance critical, and not that the system is actually
* idle, so do not add the iowait time to the CPU idle time then.
*/
io_busy = dbs_data->io_is_busy;
/* Get Absolute Load */
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_info *j_cdbs;
struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
u64 cur_wall_time, cur_idle_time;
unsigned int idle_time, wall_time;
unsigned int load;
int io_busy = 0;
j_cdbs = dbs_data->cdata->get_cpu_cdbs(j);
/*
* For the purpose of ondemand, waiting for disk IO is
* an indication that you're performance critical, and
* not that the system is actually idle. So do not add
* the iowait time to the cpu idle time.
*/
if (dbs_data->cdata->governor == GOV_ONDEMAND)
io_busy = od_tuners->io_is_busy;
cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy);
wall_time = (unsigned int)
(cur_wall_time - j_cdbs->prev_cpu_wall);
wall_time = cur_wall_time - j_cdbs->prev_cpu_wall;
j_cdbs->prev_cpu_wall = cur_wall_time;
if (cur_idle_time < j_cdbs->prev_cpu_idle)
cur_idle_time = j_cdbs->prev_cpu_idle;
idle_time = (unsigned int)
(cur_idle_time - j_cdbs->prev_cpu_idle);
if (cur_idle_time <= j_cdbs->prev_cpu_idle) {
idle_time = 0;
} else {
idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
j_cdbs->prev_cpu_idle = cur_idle_time;
}
if (ignore_nice) {
u64 cur_nice;
unsigned long cur_nice_jiffies;
u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
cdbs->prev_cpu_nice;
/*
* Assumption: nice time between sampling periods will
* be less than 2^32 jiffies for 32 bit sys
*/
cur_nice_jiffies = (unsigned long)
cputime64_to_jiffies64(cur_nice);
cdbs->prev_cpu_nice =
kcpustat_cpu(j).cpustat[CPUTIME_NICE];
idle_time += jiffies_to_usecs(cur_nice_jiffies);
idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice);
j_cdbs->prev_cpu_nice = cur_nice;
}
if (unlikely(!wall_time || wall_time < idle_time))
......@@ -128,10 +226,10 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
* dropped down. So we perform the copy only once, upon the
* first wake-up from idle.)
*
* Detecting this situation is easy: the governor's deferrable
* timer would not have fired during CPU-idle periods. Hence
* an unusually large 'wall_time' (as compared to the sampling
* rate) indicates this scenario.
* Detecting this situation is easy: the governor's utilization
* update handler would not have run during CPU-idle periods.
* Hence, an unusually large 'wall_time' (as compared to the
* sampling rate) indicates this scenario.
*
* prev_load can be zero in two cases and we must recalculate it
* for both cases:
......@@ -156,222 +254,224 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
if (load > max_load)
max_load = load;
}
dbs_data->cdata->gov_check_cpu(cpu, max_load);
return max_load;
}
EXPORT_SYMBOL_GPL(dbs_check_cpu);
EXPORT_SYMBOL_GPL(dbs_update);
void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay)
static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
unsigned int delay_us)
{
struct dbs_data *dbs_data = policy->governor_data;
struct cpu_dbs_info *cdbs;
struct cpufreq_policy *policy = policy_dbs->policy;
int cpu;
gov_update_sample_delay(policy_dbs, delay_us);
policy_dbs->last_sample_time = 0;
for_each_cpu(cpu, policy->cpus) {
cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
cdbs->timer.expires = jiffies + delay;
add_timer_on(&cdbs->timer, cpu);
struct cpu_dbs_info *cdbs = &per_cpu(cpu_dbs, cpu);
cpufreq_set_update_util_data(cpu, &cdbs->update_util);
}
}
EXPORT_SYMBOL_GPL(gov_add_timers);
static inline void gov_cancel_timers(struct cpufreq_policy *policy)
static inline void gov_clear_update_util(struct cpufreq_policy *policy)
{
struct dbs_data *dbs_data = policy->governor_data;
struct cpu_dbs_info *cdbs;
int i;
for_each_cpu(i, policy->cpus) {
cdbs = dbs_data->cdata->get_cpu_cdbs(i);
del_timer_sync(&cdbs->timer);
}
}
for_each_cpu(i, policy->cpus)
cpufreq_set_update_util_data(i, NULL);
void gov_cancel_work(struct cpu_common_dbs_info *shared)
{
/* Tell dbs_timer_handler() to skip queuing up work items. */
atomic_inc(&shared->skip_work);
/*
* If dbs_timer_handler() is already running, it may not notice the
* incremented skip_work, so wait for it to complete to prevent its work
* item from being queued up after the cancel_work_sync() below.
*/
gov_cancel_timers(shared->policy);
/*
* In case dbs_timer_handler() managed to run and spawn a work item
* before the timers have been canceled, wait for that work item to
* complete and then cancel all of the timers set up by it. If
* dbs_timer_handler() runs again at that point, it will see the
* positive value of skip_work and won't spawn any more work items.
*/
cancel_work_sync(&shared->work);
gov_cancel_timers(shared->policy);
atomic_set(&shared->skip_work, 0);
synchronize_sched();
}
EXPORT_SYMBOL_GPL(gov_cancel_work);
/* Will return if we need to evaluate cpu load again or not */
static bool need_load_eval(struct cpu_common_dbs_info *shared,
unsigned int sampling_rate)
static void gov_cancel_work(struct cpufreq_policy *policy)
{
if (policy_is_shared(shared->policy)) {
ktime_t time_now = ktime_get();
s64 delta_us = ktime_us_delta(time_now, shared->time_stamp);
/* Do nothing if we recently have sampled */
if (delta_us < (s64)(sampling_rate / 2))
return false;
else
shared->time_stamp = time_now;
}
struct policy_dbs_info *policy_dbs = policy->governor_data;
return true;
gov_clear_update_util(policy_dbs->policy);
irq_work_sync(&policy_dbs->irq_work);
cancel_work_sync(&policy_dbs->work);
atomic_set(&policy_dbs->work_count, 0);
policy_dbs->work_in_progress = false;
}
static void dbs_work_handler(struct work_struct *work)
{
struct cpu_common_dbs_info *shared = container_of(work, struct
cpu_common_dbs_info, work);
struct policy_dbs_info *policy_dbs;
struct cpufreq_policy *policy;
struct dbs_data *dbs_data;
unsigned int sampling_rate, delay;
bool eval_load;
struct dbs_governor *gov;
policy = shared->policy;
dbs_data = policy->governor_data;
policy_dbs = container_of(work, struct policy_dbs_info, work);
policy = policy_dbs->policy;
gov = dbs_governor_of(policy);
/* Kill all timers */
gov_cancel_timers(policy);
if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
sampling_rate = cs_tuners->sampling_rate;
} else {
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
sampling_rate = od_tuners->sampling_rate;
}
eval_load = need_load_eval(shared, sampling_rate);
/*
* Make sure cpufreq_governor_limits() isn't evaluating load or the
* ondemand governor isn't updating the sampling rate in parallel.
*/
mutex_lock(&policy_dbs->timer_mutex);
gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
mutex_unlock(&policy_dbs->timer_mutex);
/* Allow the utilization update handler to queue up more work. */
atomic_set(&policy_dbs->work_count, 0);
/*
* Make sure cpufreq_governor_limits() isn't evaluating load in
* parallel.
* If the update below is reordered with respect to the sample delay
* modification, the utilization update handler may end up using a stale
* sample delay value.
*/
mutex_lock(&shared->timer_mutex);
delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
mutex_unlock(&shared->timer_mutex);
smp_wmb();
policy_dbs->work_in_progress = false;
}
atomic_dec(&shared->skip_work);
static void dbs_irq_work(struct irq_work *irq_work)
{
struct policy_dbs_info *policy_dbs;
gov_add_timers(policy, delay);
policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
schedule_work(&policy_dbs->work);
}
static void dbs_timer_handler(unsigned long data)
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
unsigned long util, unsigned long max)
{
struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
struct cpu_common_dbs_info *shared = cdbs->shared;
struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
u64 delta_ns, lst;
/*
* Timer handler may not be allowed to queue the work at the moment,
* because:
* - Another timer handler has done that
* - We are stopping the governor
* - Or we are updating the sampling rate of the ondemand governor
* The work may not be allowed to be queued up right now.
* Possible reasons:
* - Work has already been queued up or is in progress.
* - It is too early (too little time from the previous sample).
*/
if (atomic_inc_return(&shared->skip_work) > 1)
atomic_dec(&shared->skip_work);
else
queue_work(system_wq, &shared->work);
}
if (policy_dbs->work_in_progress)
return;
static void set_sampling_rate(struct dbs_data *dbs_data,
unsigned int sampling_rate)
{
if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
cs_tuners->sampling_rate = sampling_rate;
} else {
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
od_tuners->sampling_rate = sampling_rate;
/*
* If the reads below are reordered before the check above, the value
* of sample_delay_ns used in the computation may be stale.
*/
smp_rmb();
lst = READ_ONCE(policy_dbs->last_sample_time);
delta_ns = time - lst;
if ((s64)delta_ns < policy_dbs->sample_delay_ns)
return;
/*
* If the policy is not shared, the irq_work may be queued up right away
* at this point. Otherwise, we need to ensure that only one of the
* CPUs sharing the policy will do that.
*/
if (policy_dbs->is_shared) {
if (!atomic_add_unless(&policy_dbs->work_count, 1, 1))
return;
/*
* If another CPU updated last_sample_time in the meantime, we
* shouldn't be here, so clear the work counter and bail out.
*/
if (unlikely(lst != READ_ONCE(policy_dbs->last_sample_time))) {
atomic_set(&policy_dbs->work_count, 0);
return;
}
}
policy_dbs->last_sample_time = time;
policy_dbs->work_in_progress = true;
irq_work_queue(&policy_dbs->irq_work);
}
static int alloc_common_dbs_info(struct cpufreq_policy *policy,
struct common_dbs_data *cdata)
static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy,
struct dbs_governor *gov)
{
struct cpu_common_dbs_info *shared;
struct policy_dbs_info *policy_dbs;
int j;
/* Allocate memory for the common information for policy->cpus */
shared = kzalloc(sizeof(*shared), GFP_KERNEL);
if (!shared)
return -ENOMEM;
/* Allocate memory for per-policy governor data. */
policy_dbs = gov->alloc();
if (!policy_dbs)
return NULL;
/* Set shared for all CPUs, online+offline */
for_each_cpu(j, policy->related_cpus)
cdata->get_cpu_cdbs(j)->shared = shared;
policy_dbs->policy = policy;
mutex_init(&policy_dbs->timer_mutex);
atomic_set(&policy_dbs->work_count, 0);
init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
INIT_WORK(&policy_dbs->work, dbs_work_handler);
mutex_init(&shared->timer_mutex);
atomic_set(&shared->skip_work, 0);
INIT_WORK(&shared->work, dbs_work_handler);
return 0;
/* Set policy_dbs for all CPUs, online+offline */
for_each_cpu(j, policy->related_cpus) {
struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
j_cdbs->policy_dbs = policy_dbs;
j_cdbs->update_util.func = dbs_update_util_handler;
}
return policy_dbs;
}
static void free_common_dbs_info(struct cpufreq_policy *policy,
struct common_dbs_data *cdata)
static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs,
struct dbs_governor *gov)
{
struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu);
struct cpu_common_dbs_info *shared = cdbs->shared;
int j;
mutex_destroy(&shared->timer_mutex);
mutex_destroy(&policy_dbs->timer_mutex);
for_each_cpu(j, policy->cpus)
cdata->get_cpu_cdbs(j)->shared = NULL;
for_each_cpu(j, policy_dbs->policy->related_cpus) {
struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
kfree(shared);
j_cdbs->policy_dbs = NULL;
j_cdbs->update_util.func = NULL;
}
gov->free(policy_dbs);
}
static int cpufreq_governor_init(struct cpufreq_policy *policy,
struct dbs_data *dbs_data,
struct common_dbs_data *cdata)
static int cpufreq_governor_init(struct cpufreq_policy *policy)
{
struct dbs_governor *gov = dbs_governor_of(policy);
struct dbs_data *dbs_data;
struct policy_dbs_info *policy_dbs;
unsigned int latency;
int ret;
int ret = 0;
/* State should be equivalent to EXIT */
if (policy->governor_data)
return -EBUSY;
if (dbs_data) {
if (WARN_ON(have_governor_per_policy()))
return -EINVAL;
policy_dbs = alloc_policy_dbs_info(policy, gov);
if (!policy_dbs)
return -ENOMEM;
ret = alloc_common_dbs_info(policy, cdata);
if (ret)
return ret;
/* Protect gov->gdbs_data against concurrent updates. */
mutex_lock(&gov_dbs_data_mutex);
dbs_data = gov->gdbs_data;
if (dbs_data) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
goto free_policy_dbs_info;
}
policy_dbs->dbs_data = dbs_data;
policy->governor_data = policy_dbs;
mutex_lock(&dbs_data->mutex);
dbs_data->usage_count++;
policy->governor_data = dbs_data;
return 0;
list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
mutex_unlock(&dbs_data->mutex);
goto out;
}
dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL);
if (!dbs_data)
return -ENOMEM;
if (!dbs_data) {
ret = -ENOMEM;
goto free_policy_dbs_info;
}
ret = alloc_common_dbs_info(policy, cdata);
if (ret)
goto free_dbs_data;
INIT_LIST_HEAD(&dbs_data->policy_dbs_list);
mutex_init(&dbs_data->mutex);
dbs_data->cdata = cdata;
dbs_data->usage_count = 1;
ret = cdata->init(dbs_data, !policy->governor->initialized);
ret = gov->init(dbs_data, !policy->governor->initialized);
if (ret)
goto free_common_dbs_info;
goto free_policy_dbs_info;
/* policy latency is in ns. Convert it to us first */
latency = policy->cpuinfo.transition_latency / 1000;
......@@ -381,216 +481,156 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy,
/* Bring kernel and HW constraints together */
dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate,
MIN_LATENCY_MULTIPLIER * latency);
set_sampling_rate(dbs_data, max(dbs_data->min_sampling_rate,
latency * LATENCY_MULTIPLIER));
dbs_data->sampling_rate = max(dbs_data->min_sampling_rate,
LATENCY_MULTIPLIER * latency);
if (!have_governor_per_policy())
cdata->gdbs_data = dbs_data;
gov->gdbs_data = dbs_data;
policy->governor_data = dbs_data;
policy->governor_data = policy_dbs;
ret = sysfs_create_group(get_governor_parent_kobj(policy),
get_sysfs_attr(dbs_data));
if (ret)
goto reset_gdbs_data;
policy_dbs->dbs_data = dbs_data;
dbs_data->usage_count = 1;
list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
return 0;
gov->kobj_type.sysfs_ops = &governor_sysfs_ops;
ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type,
get_governor_parent_kobj(policy),
"%s", gov->gov.name);
if (!ret)
goto out;
/* Failure, so roll back. */
pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret);
reset_gdbs_data:
policy->governor_data = NULL;
if (!have_governor_per_policy())
cdata->gdbs_data = NULL;
cdata->exit(dbs_data, !policy->governor->initialized);
free_common_dbs_info:
free_common_dbs_info(policy, cdata);
free_dbs_data:
gov->gdbs_data = NULL;
gov->exit(dbs_data, !policy->governor->initialized);
kfree(dbs_data);
free_policy_dbs_info:
free_policy_dbs_info(policy_dbs, gov);
out:
mutex_unlock(&gov_dbs_data_mutex);
return ret;
}
static int cpufreq_governor_exit(struct cpufreq_policy *policy,
struct dbs_data *dbs_data)
static int cpufreq_governor_exit(struct cpufreq_policy *policy)
{
struct common_dbs_data *cdata = dbs_data->cdata;
struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu);
struct dbs_governor *gov = dbs_governor_of(policy);
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
int count;
/* State should be equivalent to INIT */
if (!cdbs->shared || cdbs->shared->policy)
return -EBUSY;
/* Protect gov->gdbs_data against concurrent updates. */
mutex_lock(&gov_dbs_data_mutex);
mutex_lock(&dbs_data->mutex);
list_del(&policy_dbs->list);
count = --dbs_data->usage_count;
mutex_unlock(&dbs_data->mutex);
if (!--dbs_data->usage_count) {
sysfs_remove_group(get_governor_parent_kobj(policy),
get_sysfs_attr(dbs_data));
if (!count) {
kobject_put(&dbs_data->kobj);
policy->governor_data = NULL;
if (!have_governor_per_policy())
cdata->gdbs_data = NULL;
gov->gdbs_data = NULL;
cdata->exit(dbs_data, policy->governor->initialized == 1);
gov->exit(dbs_data, policy->governor->initialized == 1);
mutex_destroy(&dbs_data->mutex);
kfree(dbs_data);
} else {
policy->governor_data = NULL;
}
free_common_dbs_info(policy, cdata);
free_policy_dbs_info(policy_dbs, gov);
mutex_unlock(&gov_dbs_data_mutex);
return 0;
}
static int cpufreq_governor_start(struct cpufreq_policy *policy,
struct dbs_data *dbs_data)
static int cpufreq_governor_start(struct cpufreq_policy *policy)
{
struct common_dbs_data *cdata = dbs_data->cdata;
unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu;
struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu);
struct cpu_common_dbs_info *shared = cdbs->shared;
int io_busy = 0;
struct dbs_governor *gov = dbs_governor_of(policy);
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
unsigned int sampling_rate, ignore_nice, j;
unsigned int io_busy;
if (!policy->cur)
return -EINVAL;
/* State should be equivalent to INIT */
if (!shared || shared->policy)
return -EBUSY;
if (cdata->governor == GOV_CONSERVATIVE) {
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
policy_dbs->is_shared = policy_is_shared(policy);
policy_dbs->rate_mult = 1;
sampling_rate = cs_tuners->sampling_rate;
ignore_nice = cs_tuners->ignore_nice_load;
} else {
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
sampling_rate = od_tuners->sampling_rate;
ignore_nice = od_tuners->ignore_nice_load;
io_busy = od_tuners->io_is_busy;
}
shared->policy = policy;
shared->time_stamp = ktime_get();
sampling_rate = dbs_data->sampling_rate;
ignore_nice = dbs_data->ignore_nice_load;
io_busy = dbs_data->io_is_busy;
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j);
struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
unsigned int prev_load;
j_cdbs->prev_cpu_idle =
get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);
j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);
prev_load = (unsigned int)(j_cdbs->prev_cpu_wall -
j_cdbs->prev_cpu_idle);
j_cdbs->prev_load = 100 * prev_load /
(unsigned int)j_cdbs->prev_cpu_wall;
prev_load = j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle;
j_cdbs->prev_load = 100 * prev_load / (unsigned int)j_cdbs->prev_cpu_wall;
if (ignore_nice)
j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
__setup_timer(&j_cdbs->timer, dbs_timer_handler,
(unsigned long)j_cdbs,
TIMER_DEFERRABLE | TIMER_IRQSAFE);
}
if (cdata->governor == GOV_CONSERVATIVE) {
struct cs_cpu_dbs_info_s *cs_dbs_info =
cdata->get_cpu_dbs_info_s(cpu);
gov->start(policy);
cs_dbs_info->down_skip = 0;
cs_dbs_info->requested_freq = policy->cur;
} else {
struct od_ops *od_ops = cdata->gov_ops;
struct od_cpu_dbs_info_s *od_dbs_info = cdata->get_cpu_dbs_info_s(cpu);
od_dbs_info->rate_mult = 1;
od_dbs_info->sample_type = OD_NORMAL_SAMPLE;
od_ops->powersave_bias_init_cpu(cpu);
}
gov_add_timers(policy, delay_for_sampling_rate(sampling_rate));
gov_set_update_util(policy_dbs, sampling_rate);
return 0;
}
static int cpufreq_governor_stop(struct cpufreq_policy *policy,
struct dbs_data *dbs_data)
static int cpufreq_governor_stop(struct cpufreq_policy *policy)
{
struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(policy->cpu);
struct cpu_common_dbs_info *shared = cdbs->shared;
/* State should be equivalent to START */
if (!shared || !shared->policy)
return -EBUSY;
gov_cancel_work(shared);
shared->policy = NULL;
gov_cancel_work(policy);
return 0;
}
static int cpufreq_governor_limits(struct cpufreq_policy *policy,
struct dbs_data *dbs_data)
static int cpufreq_governor_limits(struct cpufreq_policy *policy)
{
struct common_dbs_data *cdata = dbs_data->cdata;
unsigned int cpu = policy->cpu;
struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu);
struct policy_dbs_info *policy_dbs = policy->governor_data;
/* State should be equivalent to START */
if (!cdbs->shared || !cdbs->shared->policy)
return -EBUSY;
mutex_lock(&policy_dbs->timer_mutex);
if (policy->max < policy->cur)
__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
else if (policy->min > policy->cur)
__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
mutex_lock(&cdbs->shared->timer_mutex);
if (policy->max < cdbs->shared->policy->cur)
__cpufreq_driver_target(cdbs->shared->policy, policy->max,
CPUFREQ_RELATION_H);
else if (policy->min > cdbs->shared->policy->cur)
__cpufreq_driver_target(cdbs->shared->policy, policy->min,
CPUFREQ_RELATION_L);
dbs_check_cpu(dbs_data, cpu);
mutex_unlock(&cdbs->shared->timer_mutex);
gov_update_sample_delay(policy_dbs, 0);
mutex_unlock(&policy_dbs->timer_mutex);
return 0;
}
int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event)
int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
{
struct dbs_data *dbs_data;
int ret;
/* Lock governor to block concurrent initialization of governor */
mutex_lock(&cdata->mutex);
if (have_governor_per_policy())
dbs_data = policy->governor_data;
else
dbs_data = cdata->gdbs_data;
if (!dbs_data && (event != CPUFREQ_GOV_POLICY_INIT)) {
ret = -EINVAL;
goto unlock;
}
if (event == CPUFREQ_GOV_POLICY_INIT) {
return cpufreq_governor_init(policy);
} else if (policy->governor_data) {
switch (event) {
case CPUFREQ_GOV_POLICY_INIT:
ret = cpufreq_governor_init(policy, dbs_data, cdata);
break;
case CPUFREQ_GOV_POLICY_EXIT:
ret = cpufreq_governor_exit(policy, dbs_data);
break;
return cpufreq_governor_exit(policy);
case CPUFREQ_GOV_START:
ret = cpufreq_governor_start(policy, dbs_data);
break;
return cpufreq_governor_start(policy);
case CPUFREQ_GOV_STOP:
ret = cpufreq_governor_stop(policy, dbs_data);
break;
return cpufreq_governor_stop(policy);
case CPUFREQ_GOV_LIMITS:
ret = cpufreq_governor_limits(policy, dbs_data);
break;
default:
ret = -EINVAL;
return cpufreq_governor_limits(policy);
}
unlock:
mutex_unlock(&cdata->mutex);
return ret;
}
return -EINVAL;
}
EXPORT_SYMBOL_GPL(cpufreq_governor_dbs);
......@@ -18,6 +18,7 @@
#define _CPUFREQ_GOVERNOR_H
#include <linux/atomic.h>
#include <linux/irq_work.h>
#include <linux/cpufreq.h>
#include <linux/kernel_stat.h>
#include <linux/module.h>
......@@ -41,96 +42,68 @@
enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE};
/*
* Macro for creating governors sysfs routines
*
* - gov_sys: One governor instance per whole system
* - gov_pol: One governor instance per policy
* Abbreviations:
* dbs: used as a shortform for demand based switching It helps to keep variable
* names smaller, simpler
* cdbs: common dbs
* od_*: On-demand governor
* cs_*: Conservative governor
*/
/* Create attributes */
#define gov_sys_attr_ro(_name) \
static struct global_attr _name##_gov_sys = \
__ATTR(_name, 0444, show_##_name##_gov_sys, NULL)
#define gov_sys_attr_rw(_name) \
static struct global_attr _name##_gov_sys = \
__ATTR(_name, 0644, show_##_name##_gov_sys, store_##_name##_gov_sys)
#define gov_pol_attr_ro(_name) \
static struct freq_attr _name##_gov_pol = \
__ATTR(_name, 0444, show_##_name##_gov_pol, NULL)
#define gov_pol_attr_rw(_name) \
static struct freq_attr _name##_gov_pol = \
__ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
/* Governor demand based switching data (per-policy or global). */
struct dbs_data {
int usage_count;
void *tuners;
unsigned int min_sampling_rate;
unsigned int ignore_nice_load;
unsigned int sampling_rate;
unsigned int sampling_down_factor;
unsigned int up_threshold;
unsigned int io_is_busy;
#define gov_sys_pol_attr_rw(_name) \
gov_sys_attr_rw(_name); \
gov_pol_attr_rw(_name)
struct kobject kobj;
struct list_head policy_dbs_list;
/*
* Protect concurrent updates to governor tunables from sysfs,
* policy_dbs_list and usage_count.
*/
struct mutex mutex;
};
#define gov_sys_pol_attr_ro(_name) \
gov_sys_attr_ro(_name); \
gov_pol_attr_ro(_name)
/* Governor's specific attributes */
struct dbs_data;
struct governor_attr {
struct attribute attr;
ssize_t (*show)(struct dbs_data *dbs_data, char *buf);
ssize_t (*store)(struct dbs_data *dbs_data, const char *buf,
size_t count);
};
/* Create show/store routines */
#define show_one(_gov, file_name) \
static ssize_t show_##file_name##_gov_sys \
(struct kobject *kobj, struct attribute *attr, char *buf) \
#define gov_show_one(_gov, file_name) \
static ssize_t show_##file_name \
(struct dbs_data *dbs_data, char *buf) \
{ \
struct _gov##_dbs_tuners *tuners = _gov##_dbs_cdata.gdbs_data->tuners; \
return sprintf(buf, "%u\n", tuners->file_name); \
} \
\
static ssize_t show_##file_name##_gov_pol \
(struct cpufreq_policy *policy, char *buf) \
{ \
struct dbs_data *dbs_data = policy->governor_data; \
struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \
return sprintf(buf, "%u\n", tuners->file_name); \
}
#define store_one(_gov, file_name) \
static ssize_t store_##file_name##_gov_sys \
(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) \
{ \
struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \
return store_##file_name(dbs_data, buf, count); \
} \
\
static ssize_t store_##file_name##_gov_pol \
(struct cpufreq_policy *policy, const char *buf, size_t count) \
#define gov_show_one_common(file_name) \
static ssize_t show_##file_name \
(struct dbs_data *dbs_data, char *buf) \
{ \
struct dbs_data *dbs_data = policy->governor_data; \
return store_##file_name(dbs_data, buf, count); \
return sprintf(buf, "%u\n", dbs_data->file_name); \
}
#define show_store_one(_gov, file_name) \
show_one(_gov, file_name); \
store_one(_gov, file_name)
#define gov_attr_ro(_name) \
static struct governor_attr _name = \
__ATTR(_name, 0444, show_##_name, NULL)
/* create helper routines */
#define define_get_cpu_dbs_routines(_dbs_info) \
static struct cpu_dbs_info *get_cpu_cdbs(int cpu) \
{ \
return &per_cpu(_dbs_info, cpu).cdbs; \
} \
\
static void *get_cpu_dbs_info_s(int cpu) \
{ \
return &per_cpu(_dbs_info, cpu); \
}
/*
* Abbreviations:
* dbs: used as a shortform for demand based switching It helps to keep variable
* names smaller, simpler
* cdbs: common dbs
* od_*: On-demand governor
* cs_*: Conservative governor
*/
#define gov_attr_rw(_name) \
static struct governor_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)
/* Common to all CPUs of a policy */
struct cpu_common_dbs_info {
struct policy_dbs_info {
struct cpufreq_policy *policy;
/*
* Per policy mutex that serializes load evaluation from limit-change
......@@ -138,11 +111,27 @@ struct cpu_common_dbs_info {
*/
struct mutex timer_mutex;
ktime_t time_stamp;
atomic_t skip_work;
u64 last_sample_time;
s64 sample_delay_ns;
atomic_t work_count;
struct irq_work irq_work;
struct work_struct work;
/* dbs_data may be shared between multiple policy objects */
struct dbs_data *dbs_data;
struct list_head list;
/* Multiplier for increasing sample delay temporarily. */
unsigned int rate_mult;
/* Status indicators */
bool is_shared; /* This object is used by multiple CPUs */
bool work_in_progress; /* Work is being queued up or in progress */
};
static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs,
unsigned int delay_us)
{
policy_dbs->sample_delay_ns = delay_us * NSEC_PER_USEC;
}
/* Per cpu structures */
struct cpu_dbs_info {
u64 prev_cpu_idle;
......@@ -155,54 +144,14 @@ struct cpu_dbs_info {
* wake-up from idle.
*/
unsigned int prev_load;
struct timer_list timer;
struct cpu_common_dbs_info *shared;
};
struct od_cpu_dbs_info_s {
struct cpu_dbs_info cdbs;
struct cpufreq_frequency_table *freq_table;
unsigned int freq_lo;
unsigned int freq_lo_jiffies;
unsigned int freq_hi_jiffies;
unsigned int rate_mult;
unsigned int sample_type:1;
};
struct cs_cpu_dbs_info_s {
struct cpu_dbs_info cdbs;
unsigned int down_skip;
unsigned int requested_freq;
};
/* Per policy Governors sysfs tunables */
struct od_dbs_tuners {
unsigned int ignore_nice_load;
unsigned int sampling_rate;
unsigned int sampling_down_factor;
unsigned int up_threshold;
unsigned int powersave_bias;
unsigned int io_is_busy;
};
struct cs_dbs_tuners {
unsigned int ignore_nice_load;
unsigned int sampling_rate;
unsigned int sampling_down_factor;
unsigned int up_threshold;
unsigned int down_threshold;
unsigned int freq_step;
struct update_util_data update_util;
struct policy_dbs_info *policy_dbs;
};
/* Common Governor data across policies */
struct dbs_data;
struct common_dbs_data {
/* Common across governors */
#define GOV_ONDEMAND 0
#define GOV_CONSERVATIVE 1
int governor;
struct attribute_group *attr_group_gov_sys; /* one governor - system */
struct attribute_group *attr_group_gov_pol; /* one governor - policy */
struct dbs_governor {
struct cpufreq_governor gov;
struct kobj_type kobj_type;
/*
* Common data for platforms that don't set
......@@ -210,74 +159,32 @@ struct common_dbs_data {
*/
struct dbs_data *gdbs_data;
struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu);
void *(*get_cpu_dbs_info_s)(int cpu);
unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy,
bool modify_all);
void (*gov_check_cpu)(int cpu, unsigned int load);
unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy);
struct policy_dbs_info *(*alloc)(void);
void (*free)(struct policy_dbs_info *policy_dbs);
int (*init)(struct dbs_data *dbs_data, bool notify);
void (*exit)(struct dbs_data *dbs_data, bool notify);
/* Governor specific ops, see below */
void *gov_ops;
/*
* Protects governor's data (struct dbs_data and struct common_dbs_data)
*/
struct mutex mutex;
void (*start)(struct cpufreq_policy *policy);
};
/* Governor Per policy data */
struct dbs_data {
struct common_dbs_data *cdata;
unsigned int min_sampling_rate;
int usage_count;
void *tuners;
};
static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy)
{
return container_of(policy->governor, struct dbs_governor, gov);
}
/* Governor specific ops, will be passed to dbs_data->gov_ops */
/* Governor specific operations */
struct od_ops {
void (*powersave_bias_init_cpu)(int cpu);
unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy,
unsigned int freq_next, unsigned int relation);
void (*freq_increase)(struct cpufreq_policy *policy, unsigned int freq);
};
static inline int delay_for_sampling_rate(unsigned int sampling_rate)
{
int delay = usecs_to_jiffies(sampling_rate);
/* We want all CPUs to do sampling nearly on same jiffy */
if (num_online_cpus() > 1)
delay -= jiffies % delay;
return delay;
}
#define declare_show_sampling_rate_min(_gov) \
static ssize_t show_sampling_rate_min_gov_sys \
(struct kobject *kobj, struct attribute *attr, char *buf) \
{ \
struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \
return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \
} \
\
static ssize_t show_sampling_rate_min_gov_pol \
(struct cpufreq_policy *policy, char *buf) \
{ \
struct dbs_data *dbs_data = policy->governor_data; \
return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \
}
extern struct mutex cpufreq_governor_lock;
void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay);
void gov_cancel_work(struct cpu_common_dbs_info *shared);
void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event);
unsigned int dbs_update(struct cpufreq_policy *policy);
int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event);
void od_register_powersave_bias_handler(unsigned int (*f)
(struct cpufreq_policy *, unsigned int, unsigned int),
unsigned int powersave_bias);
void od_unregister_powersave_bias_handler(void);
ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
size_t count);
void gov_update_cpu_data(struct dbs_data *dbs_data);
#endif /* _CPUFREQ_GOVERNOR_H */
......@@ -16,7 +16,8 @@
#include <linux/percpu-defs.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include "cpufreq_governor.h"
#include "cpufreq_ondemand.h"
/* On-demand governor macros */
#define DEF_FREQUENCY_UP_THRESHOLD (80)
......@@ -27,24 +28,10 @@
#define MIN_FREQUENCY_UP_THRESHOLD (11)
#define MAX_FREQUENCY_UP_THRESHOLD (100)
static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
static struct od_ops od_ops;
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static struct cpufreq_governor cpufreq_gov_ondemand;
#endif
static unsigned int default_powersave_bias;
static void ondemand_powersave_bias_init_cpu(int cpu)
{
struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
dbs_info->freq_lo = 0;
}
/*
* Not all CPUs want IO time to be accounted as busy; this depends on how
* efficient idling at a higher frequency/voltage is.
......@@ -70,8 +57,8 @@ static int should_io_be_busy(void)
/*
* Find right freq to be set now with powersave_bias on.
* Returns the freq_hi to be used right now and will set freq_hi_jiffies,
* freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
* Returns the freq_hi to be used right now and will set freq_hi_delay_us,
* freq_lo, and freq_lo_delay_us in percpu area for averaging freqs.
*/
static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
unsigned int freq_next, unsigned int relation)
......@@ -79,15 +66,15 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
unsigned int freq_req, freq_reduc, freq_avg;
unsigned int freq_hi, freq_lo;
unsigned int index = 0;
unsigned int jiffies_total, jiffies_hi, jiffies_lo;
struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
policy->cpu);
struct dbs_data *dbs_data = policy->governor_data;
unsigned int delay_hi_us;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
struct dbs_data *dbs_data = policy_dbs->dbs_data;
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
if (!dbs_info->freq_table) {
dbs_info->freq_lo = 0;
dbs_info->freq_lo_jiffies = 0;
dbs_info->freq_lo_delay_us = 0;
return freq_next;
}
......@@ -110,31 +97,30 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
/* Find out how long we have to be in hi and lo freqs */
if (freq_hi == freq_lo) {
dbs_info->freq_lo = 0;
dbs_info->freq_lo_jiffies = 0;
dbs_info->freq_lo_delay_us = 0;
return freq_lo;
}
jiffies_total = usecs_to_jiffies(od_tuners->sampling_rate);
jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
jiffies_hi += ((freq_hi - freq_lo) / 2);
jiffies_hi /= (freq_hi - freq_lo);
jiffies_lo = jiffies_total - jiffies_hi;
delay_hi_us = (freq_avg - freq_lo) * dbs_data->sampling_rate;
delay_hi_us += (freq_hi - freq_lo) / 2;
delay_hi_us /= freq_hi - freq_lo;
dbs_info->freq_hi_delay_us = delay_hi_us;
dbs_info->freq_lo = freq_lo;
dbs_info->freq_lo_jiffies = jiffies_lo;
dbs_info->freq_hi_jiffies = jiffies_hi;
dbs_info->freq_lo_delay_us = dbs_data->sampling_rate - delay_hi_us;
return freq_hi;
}
static void ondemand_powersave_bias_init(void)
static void ondemand_powersave_bias_init(struct cpufreq_policy *policy)
{
int i;
for_each_online_cpu(i) {
ondemand_powersave_bias_init_cpu(i);
}
struct od_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
dbs_info->freq_table = cpufreq_frequency_get_table(policy->cpu);
dbs_info->freq_lo = 0;
}
static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
{
struct dbs_data *dbs_data = policy->governor_data;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
if (od_tuners->powersave_bias)
......@@ -152,21 +138,21 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
* (default), then we try to increase frequency. Else, we adjust the frequency
* proportional to load.
*/
static void od_check_cpu(int cpu, unsigned int load)
static void od_update(struct cpufreq_policy *policy)
{
struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
struct cpufreq_policy *policy = dbs_info->cdbs.shared->policy;
struct dbs_data *dbs_data = policy->governor_data;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
struct dbs_data *dbs_data = policy_dbs->dbs_data;
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
unsigned int load = dbs_update(policy);
dbs_info->freq_lo = 0;
/* Check for frequency increase */
if (load > od_tuners->up_threshold) {
if (load > dbs_data->up_threshold) {
/* If switching to max speed, apply sampling_down_factor */
if (policy->cur < policy->max)
dbs_info->rate_mult =
od_tuners->sampling_down_factor;
policy_dbs->rate_mult = dbs_data->sampling_down_factor;
dbs_freq_increase(policy, policy->max);
} else {
/* Calculate the next frequency proportional to load */
......@@ -177,177 +163,70 @@ static void od_check_cpu(int cpu, unsigned int load)
freq_next = min_f + load * (max_f - min_f) / 100;
/* No longer fully busy, reset rate_mult */
dbs_info->rate_mult = 1;
policy_dbs->rate_mult = 1;
if (!od_tuners->powersave_bias) {
__cpufreq_driver_target(policy, freq_next,
CPUFREQ_RELATION_C);
return;
}
freq_next = od_ops.powersave_bias_target(policy, freq_next,
if (od_tuners->powersave_bias)
freq_next = od_ops.powersave_bias_target(policy,
freq_next,
CPUFREQ_RELATION_L);
__cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C);
}
}
static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
{
struct dbs_data *dbs_data = policy->governor_data;
unsigned int cpu = policy->cpu;
struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
cpu);
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
int delay = 0, sample_type = dbs_info->sample_type;
if (!modify_all)
goto max_delay;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
int sample_type = dbs_info->sample_type;
/* Common NORMAL_SAMPLE setup */
dbs_info->sample_type = OD_NORMAL_SAMPLE;
if (sample_type == OD_SUB_SAMPLE) {
delay = dbs_info->freq_lo_jiffies;
/*
* OD_SUB_SAMPLE doesn't make sense if sample_delay_ns is 0, so ignore
* it then.
*/
if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) {
__cpufreq_driver_target(policy, dbs_info->freq_lo,
CPUFREQ_RELATION_H);
} else {
dbs_check_cpu(dbs_data, cpu);
return dbs_info->freq_lo_delay_us;
}
od_update(policy);
if (dbs_info->freq_lo) {
/* Setup timer for SUB_SAMPLE */
dbs_info->sample_type = OD_SUB_SAMPLE;
delay = dbs_info->freq_hi_jiffies;
}
return dbs_info->freq_hi_delay_us;
}
max_delay:
if (!delay)
delay = delay_for_sampling_rate(od_tuners->sampling_rate
* dbs_info->rate_mult);
return delay;
return dbs_data->sampling_rate * policy_dbs->rate_mult;
}
/************************** sysfs interface ************************/
static struct common_dbs_data od_dbs_cdata;
/**
* update_sampling_rate - update sampling rate effective immediately if needed.
* @new_rate: new sampling rate
*
* If new rate is smaller than the old, simply updating
* dbs_tuners_int.sampling_rate might not be appropriate. For example, if the
* original sampling_rate was 1 second and the requested new sampling rate is 10
* ms because the user needs immediate reaction from ondemand governor, but not
* sure if higher frequency will be required or not, then, the governor may
* change the sampling rate too late; up to 1 second later. Thus, if we are
* reducing the sampling rate, we need to make the new value effective
* immediately.
*/
static void update_sampling_rate(struct dbs_data *dbs_data,
unsigned int new_rate)
{
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
struct cpumask cpumask;
int cpu;
od_tuners->sampling_rate = new_rate = max(new_rate,
dbs_data->min_sampling_rate);
/*
* Lock governor so that governor start/stop can't execute in parallel.
*/
mutex_lock(&od_dbs_cdata.mutex);
cpumask_copy(&cpumask, cpu_online_mask);
for_each_cpu(cpu, &cpumask) {
struct cpufreq_policy *policy;
struct od_cpu_dbs_info_s *dbs_info;
struct cpu_dbs_info *cdbs;
struct cpu_common_dbs_info *shared;
unsigned long next_sampling, appointed_at;
dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
cdbs = &dbs_info->cdbs;
shared = cdbs->shared;
/*
* A valid shared and shared->policy means governor hasn't
* stopped or exited yet.
*/
if (!shared || !shared->policy)
continue;
policy = shared->policy;
/* clear all CPUs of this policy */
cpumask_andnot(&cpumask, &cpumask, policy->cpus);
/*
* Update sampling rate for CPUs whose policy is governed by
* dbs_data. In case of governor_per_policy, only a single
* policy will be governed by dbs_data, otherwise there can be
* multiple policies that are governed by the same dbs_data.
*/
if (dbs_data != policy->governor_data)
continue;
/*
* Checking this for any CPU should be fine, timers for all of
* them are scheduled together.
*/
next_sampling = jiffies + usecs_to_jiffies(new_rate);
appointed_at = dbs_info->cdbs.timer.expires;
if (time_before(next_sampling, appointed_at)) {
gov_cancel_work(shared);
gov_add_timers(policy, usecs_to_jiffies(new_rate));
}
}
mutex_unlock(&od_dbs_cdata.mutex);
}
static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
size_t count)
{
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
update_sampling_rate(dbs_data, input);
return count;
}
static struct dbs_governor od_dbs_gov;
static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf,
size_t count)
{
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
unsigned int input;
int ret;
unsigned int j;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
od_tuners->io_is_busy = !!input;
dbs_data->io_is_busy = !!input;
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
j);
dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy);
}
gov_update_cpu_data(dbs_data);
return count;
}
static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
size_t count)
{
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
......@@ -357,40 +236,43 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
return -EINVAL;
}
od_tuners->up_threshold = input;
dbs_data->up_threshold = input;
return count;
}
static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
const char *buf, size_t count)
{
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
unsigned int input, j;
struct policy_dbs_info *policy_dbs;
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
return -EINVAL;
od_tuners->sampling_down_factor = input;
dbs_data->sampling_down_factor = input;
/* Reset down sampling multiplier in case it was active */
for_each_online_cpu(j) {
struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
j);
dbs_info->rate_mult = 1;
list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
/*
* Doing this without locking might lead to using different
* rate_mult values in od_update() and od_dbs_timer().
*/
mutex_lock(&policy_dbs->timer_mutex);
policy_dbs->rate_mult = 1;
mutex_unlock(&policy_dbs->timer_mutex);
}
return count;
}
static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
const char *buf, size_t count)
{
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
unsigned int input;
int ret;
unsigned int j;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
......@@ -398,22 +280,14 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
if (input > 1)
input = 1;
if (input == od_tuners->ignore_nice_load) { /* nothing to do */
if (input == dbs_data->ignore_nice_load) { /* nothing to do */
return count;
}
od_tuners->ignore_nice_load = input;
dbs_data->ignore_nice_load = input;
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct od_cpu_dbs_info_s *dbs_info;
dbs_info = &per_cpu(od_cpu_dbs_info, j);
dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy);
if (od_tuners->ignore_nice_load)
dbs_info->cdbs.prev_cpu_nice =
kcpustat_cpu(j).cpustat[CPUTIME_NICE];
gov_update_cpu_data(dbs_data);
}
return count;
}
......@@ -421,6 +295,7 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf,
size_t count)
{
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
struct policy_dbs_info *policy_dbs;
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
......@@ -432,59 +307,54 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf,
input = 1000;
od_tuners->powersave_bias = input;
ondemand_powersave_bias_init();
list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list)
ondemand_powersave_bias_init(policy_dbs->policy);
return count;
}
show_store_one(od, sampling_rate);
show_store_one(od, io_is_busy);
show_store_one(od, up_threshold);
show_store_one(od, sampling_down_factor);
show_store_one(od, ignore_nice_load);
show_store_one(od, powersave_bias);
declare_show_sampling_rate_min(od);
gov_sys_pol_attr_rw(sampling_rate);
gov_sys_pol_attr_rw(io_is_busy);
gov_sys_pol_attr_rw(up_threshold);
gov_sys_pol_attr_rw(sampling_down_factor);
gov_sys_pol_attr_rw(ignore_nice_load);
gov_sys_pol_attr_rw(powersave_bias);
gov_sys_pol_attr_ro(sampling_rate_min);
static struct attribute *dbs_attributes_gov_sys[] = {
&sampling_rate_min_gov_sys.attr,
&sampling_rate_gov_sys.attr,
&up_threshold_gov_sys.attr,
&sampling_down_factor_gov_sys.attr,
&ignore_nice_load_gov_sys.attr,
&powersave_bias_gov_sys.attr,
&io_is_busy_gov_sys.attr,
gov_show_one_common(sampling_rate);
gov_show_one_common(up_threshold);
gov_show_one_common(sampling_down_factor);
gov_show_one_common(ignore_nice_load);
gov_show_one_common(min_sampling_rate);
gov_show_one_common(io_is_busy);
gov_show_one(od, powersave_bias);
gov_attr_rw(sampling_rate);
gov_attr_rw(io_is_busy);
gov_attr_rw(up_threshold);
gov_attr_rw(sampling_down_factor);
gov_attr_rw(ignore_nice_load);
gov_attr_rw(powersave_bias);
gov_attr_ro(min_sampling_rate);
static struct attribute *od_attributes[] = {
&min_sampling_rate.attr,
&sampling_rate.attr,
&up_threshold.attr,
&sampling_down_factor.attr,
&ignore_nice_load.attr,
&powersave_bias.attr,
&io_is_busy.attr,
NULL
};
static struct attribute_group od_attr_group_gov_sys = {
.attrs = dbs_attributes_gov_sys,
.name = "ondemand",
};
/************************** sysfs end ************************/
static struct attribute *dbs_attributes_gov_pol[] = {
&sampling_rate_min_gov_pol.attr,
&sampling_rate_gov_pol.attr,
&up_threshold_gov_pol.attr,
&sampling_down_factor_gov_pol.attr,
&ignore_nice_load_gov_pol.attr,
&powersave_bias_gov_pol.attr,
&io_is_busy_gov_pol.attr,
NULL
};
static struct policy_dbs_info *od_alloc(void)
{
struct od_policy_dbs_info *dbs_info;
static struct attribute_group od_attr_group_gov_pol = {
.attrs = dbs_attributes_gov_pol,
.name = "ondemand",
};
dbs_info = kzalloc(sizeof(*dbs_info), GFP_KERNEL);
return dbs_info ? &dbs_info->policy_dbs : NULL;
}
/************************** sysfs end ************************/
static void od_free(struct policy_dbs_info *policy_dbs)
{
kfree(to_dbs_info(policy_dbs));
}
static int od_init(struct dbs_data *dbs_data, bool notify)
{
......@@ -503,7 +373,7 @@ static int od_init(struct dbs_data *dbs_data, bool notify)
put_cpu();
if (idle_time != -1ULL) {
/* Idle micro accounting is supported. Use finer thresholds */
tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
/*
* In nohz/micro accounting case we set the minimum frequency
* not depending on HZ, but fixed (very low). The deferred
......@@ -511,17 +381,17 @@ static int od_init(struct dbs_data *dbs_data, bool notify)
*/
dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
} else {
tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
/* For correct statistics, we need 10 ticks for each measure */
dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
jiffies_to_usecs(10);
}
tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
tuners->ignore_nice_load = 0;
dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
dbs_data->ignore_nice_load = 0;
tuners->powersave_bias = default_powersave_bias;
tuners->io_is_busy = should_io_be_busy();
dbs_data->io_is_busy = should_io_be_busy();
dbs_data->tuners = tuners;
return 0;
......@@ -532,33 +402,38 @@ static void od_exit(struct dbs_data *dbs_data, bool notify)
kfree(dbs_data->tuners);
}
define_get_cpu_dbs_routines(od_cpu_dbs_info);
static void od_start(struct cpufreq_policy *policy)
{
struct od_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
dbs_info->sample_type = OD_NORMAL_SAMPLE;
ondemand_powersave_bias_init(policy);
}
static struct od_ops od_ops = {
.powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu,
.powersave_bias_target = generic_powersave_bias_target,
.freq_increase = dbs_freq_increase,
};
static struct common_dbs_data od_dbs_cdata = {
.governor = GOV_ONDEMAND,
.attr_group_gov_sys = &od_attr_group_gov_sys,
.attr_group_gov_pol = &od_attr_group_gov_pol,
.get_cpu_cdbs = get_cpu_cdbs,
.get_cpu_dbs_info_s = get_cpu_dbs_info_s,
static struct dbs_governor od_dbs_gov = {
.gov = {
.name = "ondemand",
.governor = cpufreq_governor_dbs,
.max_transition_latency = TRANSITION_LATENCY_LIMIT,
.owner = THIS_MODULE,
},
.kobj_type = { .default_attrs = od_attributes },
.gov_dbs_timer = od_dbs_timer,
.gov_check_cpu = od_check_cpu,
.gov_ops = &od_ops,
.alloc = od_alloc,
.free = od_free,
.init = od_init,
.exit = od_exit,
.mutex = __MUTEX_INITIALIZER(od_dbs_cdata.mutex),
.start = od_start,
};
#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov)
static void od_set_powersave_bias(unsigned int powersave_bias)
{
struct cpufreq_policy *policy;
struct dbs_data *dbs_data;
struct od_dbs_tuners *od_tuners;
unsigned int cpu;
cpumask_t done;
......@@ -567,22 +442,25 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
get_online_cpus();
for_each_online_cpu(cpu) {
struct cpu_common_dbs_info *shared;
struct cpufreq_policy *policy;
struct policy_dbs_info *policy_dbs;
struct dbs_data *dbs_data;
struct od_dbs_tuners *od_tuners;
if (cpumask_test_cpu(cpu, &done))
continue;
shared = per_cpu(od_cpu_dbs_info, cpu).cdbs.shared;
if (!shared)
policy = cpufreq_cpu_get_raw(cpu);
if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND)
continue;
policy = shared->policy;
cpumask_or(&done, &done, policy->cpus);
if (policy->governor != &cpufreq_gov_ondemand)
policy_dbs = policy->governor_data;
if (!policy_dbs)
continue;
dbs_data = policy->governor_data;
cpumask_or(&done, &done, policy->cpus);
dbs_data = policy_dbs->dbs_data;
od_tuners = dbs_data->tuners;
od_tuners->powersave_bias = default_powersave_bias;
}
......@@ -605,30 +483,14 @@ void od_unregister_powersave_bias_handler(void)
}
EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy,
unsigned int event)
{
return cpufreq_governor_dbs(policy, &od_dbs_cdata, event);
}
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
#endif
struct cpufreq_governor cpufreq_gov_ondemand = {
.name = "ondemand",
.governor = od_cpufreq_governor_dbs,
.max_transition_latency = TRANSITION_LATENCY_LIMIT,
.owner = THIS_MODULE,
};
static int __init cpufreq_gov_dbs_init(void)
{
return cpufreq_register_governor(&cpufreq_gov_ondemand);
return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND);
}
static void __exit cpufreq_gov_dbs_exit(void)
{
cpufreq_unregister_governor(&cpufreq_gov_ondemand);
cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND);
}
MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
......@@ -638,6 +500,11 @@ MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
MODULE_LICENSE("GPL");
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
struct cpufreq_governor *cpufreq_default_governor(void)
{
return CPU_FREQ_GOV_ONDEMAND;
}
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
......
/*
* Header file for CPUFreq ondemand governor and related code.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include "cpufreq_governor.h"
struct od_policy_dbs_info {
struct policy_dbs_info policy_dbs;
struct cpufreq_frequency_table *freq_table;
unsigned int freq_lo;
unsigned int freq_lo_delay_us;
unsigned int freq_hi_delay_us;
unsigned int sample_type:1;
};
static inline struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
{
return container_of(policy_dbs, struct od_policy_dbs_info, policy_dbs);
}
struct od_dbs_tuners {
unsigned int powersave_bias;
};
......@@ -33,10 +33,7 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy,
return 0;
}
#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
static
#endif
struct cpufreq_governor cpufreq_gov_performance = {
static struct cpufreq_governor cpufreq_gov_performance = {
.name = "performance",
.governor = cpufreq_governor_performance,
.owner = THIS_MODULE,
......@@ -52,6 +49,19 @@ static void __exit cpufreq_gov_performance_exit(void)
cpufreq_unregister_governor(&cpufreq_gov_performance);
}
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &cpufreq_gov_performance;
}
#endif
#ifndef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
struct cpufreq_governor *cpufreq_fallback_governor(void)
{
return &cpufreq_gov_performance;
}
#endif
MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION("CPUfreq policy governor 'performance'");
MODULE_LICENSE("GPL");
......
......@@ -33,10 +33,7 @@ static int cpufreq_governor_powersave(struct cpufreq_policy *policy,
return 0;
}
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
static
#endif
struct cpufreq_governor cpufreq_gov_powersave = {
static struct cpufreq_governor cpufreq_gov_powersave = {
.name = "powersave",
.governor = cpufreq_governor_powersave,
.owner = THIS_MODULE,
......@@ -57,6 +54,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'");
MODULE_LICENSE("GPL");
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &cpufreq_gov_powersave;
}
fs_initcall(cpufreq_gov_powersave_init);
#else
module_init(cpufreq_gov_powersave_init);
......
......@@ -89,10 +89,7 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy,
return rc;
}
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
static
#endif
struct cpufreq_governor cpufreq_gov_userspace = {
static struct cpufreq_governor cpufreq_gov_userspace = {
.name = "userspace",
.governor = cpufreq_governor_userspace,
.store_setspeed = cpufreq_set,
......@@ -116,6 +113,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'");
MODULE_LICENSE("GPL");
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &cpufreq_gov_userspace;
}
fs_initcall(cpufreq_gov_userspace_init);
#else
module_init(cpufreq_gov_userspace_init);
......
......@@ -71,7 +71,7 @@ struct sample {
u64 mperf;
u64 tsc;
int freq;
ktime_t time;
u64 time;
};
struct pstate_data {
......@@ -103,13 +103,13 @@ struct _pid {
struct cpudata {
int cpu;
struct timer_list timer;
struct update_util_data update_util;
struct pstate_data pstate;
struct vid_data vid;
struct _pid pid;
ktime_t last_sample_time;
u64 last_sample_time;
u64 prev_aperf;
u64 prev_mperf;
u64 prev_tsc;
......@@ -120,6 +120,7 @@ struct cpudata {
static struct cpudata **all_cpu_data;
struct pstate_adjust_policy {
int sample_rate_ms;
s64 sample_rate_ns;
int deadband;
int setpoint;
int p_gain_pct;
......@@ -197,8 +198,8 @@ static struct perf_limits *limits = &powersave_limits;
static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
int deadband, int integral) {
pid->setpoint = setpoint;
pid->deadband = deadband;
pid->setpoint = int_tofp(setpoint);
pid->deadband = int_tofp(deadband);
pid->integral = int_tofp(integral);
pid->last_err = int_tofp(setpoint) - int_tofp(busy);
}
......@@ -224,9 +225,9 @@ static signed int pid_calc(struct _pid *pid, int32_t busy)
int32_t pterm, dterm, fp_error;
int32_t integral_limit;
fp_error = int_tofp(pid->setpoint) - busy;
fp_error = pid->setpoint - busy;
if (abs(fp_error) <= int_tofp(pid->deadband))
if (abs(fp_error) <= pid->deadband)
return 0;
pterm = mul_fp(pid->p_gain, fp_error);
......@@ -286,7 +287,7 @@ static inline void update_turbo_state(void)
cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
}
static void intel_pstate_hwp_set(void)
static void intel_pstate_hwp_set(const struct cpumask *cpumask)
{
int min, hw_min, max, hw_max, cpu, range, adj_range;
u64 value, cap;
......@@ -296,9 +297,7 @@ static void intel_pstate_hwp_set(void)
hw_max = HWP_HIGHEST_PERF(cap);
range = hw_max - hw_min;
get_online_cpus();
for_each_online_cpu(cpu) {
for_each_cpu(cpu, cpumask) {
rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
adj_range = limits->min_perf_pct * range / 100;
min = hw_min + adj_range;
......@@ -317,7 +316,12 @@ static void intel_pstate_hwp_set(void)
value |= HWP_MAX_PERF(max);
wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
}
}
static void intel_pstate_hwp_set_online_cpus(void)
{
get_online_cpus();
intel_pstate_hwp_set(cpu_online_mask);
put_online_cpus();
}
......@@ -439,7 +443,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
limits->no_turbo = clamp_t(int, input, 0, 1);
if (hwp_active)
intel_pstate_hwp_set();
intel_pstate_hwp_set_online_cpus();
return count;
}
......@@ -465,7 +469,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
int_tofp(100));
if (hwp_active)
intel_pstate_hwp_set();
intel_pstate_hwp_set_online_cpus();
return count;
}
......@@ -490,7 +494,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
int_tofp(100));
if (hwp_active)
intel_pstate_hwp_set();
intel_pstate_hwp_set_online_cpus();
return count;
}
......@@ -531,6 +535,9 @@ static void __init intel_pstate_sysfs_expose_params(void)
static void intel_pstate_hwp_enable(struct cpudata *cpudata)
{
/* First disable HWP notification interrupt as we don't process them */
wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
}
......@@ -712,7 +719,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate)
if (limits->no_turbo && !limits->turbo_disabled)
val |= (u64)1 << 32;
wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
wrmsrl(MSR_IA32_PERF_CTL, val);
}
static int knl_get_turbo_pstate(void)
......@@ -824,11 +831,11 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
* policy, or by cpu specific default values determined through
* experimentation.
*/
max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits->max_perf));
max_perf_adj = fp_toint(max_perf * limits->max_perf);
*max = clamp_t(int, max_perf_adj,
cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits->min_perf));
min_perf = fp_toint(max_perf * limits->min_perf);
*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
}
......@@ -874,16 +881,10 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu)
core_pct = int_tofp(sample->aperf) * int_tofp(100);
core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
sample->freq = fp_toint(
mul_fp(int_tofp(
cpu->pstate.max_pstate_physical *
cpu->pstate.scaling / 100),
core_pct));
sample->core_pct_busy = (int32_t)core_pct;
}
static inline void intel_pstate_sample(struct cpudata *cpu)
static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
{
u64 aperf, mperf;
unsigned long flags;
......@@ -893,14 +894,14 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
tsc = rdtsc();
if ((cpu->prev_mperf == mperf) || (cpu->prev_tsc == tsc)) {
if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
local_irq_restore(flags);
return;
return false;
}
local_irq_restore(flags);
cpu->last_sample_time = cpu->sample.time;
cpu->sample.time = ktime_get();
cpu->sample.time = time;
cpu->sample.aperf = aperf;
cpu->sample.mperf = mperf;
cpu->sample.tsc = tsc;
......@@ -908,27 +909,16 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
cpu->sample.mperf -= cpu->prev_mperf;
cpu->sample.tsc -= cpu->prev_tsc;
intel_pstate_calc_busy(cpu);
cpu->prev_aperf = aperf;
cpu->prev_mperf = mperf;
cpu->prev_tsc = tsc;
return true;
}
static inline void intel_hwp_set_sample_time(struct cpudata *cpu)
{
int delay;
delay = msecs_to_jiffies(50);
mod_timer_pinned(&cpu->timer, jiffies + delay);
}
static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
static inline int32_t get_avg_frequency(struct cpudata *cpu)
{
int delay;
delay = msecs_to_jiffies(pid_params.sample_rate_ms);
mod_timer_pinned(&cpu->timer, jiffies + delay);
return div64_u64(cpu->pstate.max_pstate_physical * cpu->sample.aperf *
cpu->pstate.scaling, cpu->sample.mperf);
}
static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
......@@ -954,7 +944,6 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
mperf = cpu->sample.mperf + delta_iowait_mperf;
cpu->prev_cummulative_iowait = cummulative_iowait;
/*
* The load can be estimated as the ratio of the mperf counter
* running at a constant frequency during active periods
......@@ -970,8 +959,9 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
{
int32_t core_busy, max_pstate, current_pstate, sample_ratio;
s64 duration_us;
u32 sample_time;
u64 duration_ns;
intel_pstate_calc_busy(cpu);
/*
* core_busy is the ratio of actual performance to max
......@@ -990,18 +980,16 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
/*
* Since we have a deferred timer, it will not fire unless
* we are in C0. So, determine if the actual elapsed time
* is significantly greater (3x) than our sample interval. If it
* is, then we were idle for a long enough period of time
* to adjust our busyness.
* Since our utilization update callback will not run unless we are
* in C0, check if the actual elapsed time is significantly greater (3x)
* than our sample interval. If it is, then we were idle for a long
* enough period of time to adjust our busyness.
*/
sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC;
duration_us = ktime_us_delta(cpu->sample.time,
cpu->last_sample_time);
if (duration_us > sample_time * 3) {
sample_ratio = div_fp(int_tofp(sample_time),
int_tofp(duration_us));
duration_ns = cpu->sample.time - cpu->last_sample_time;
if ((s64)duration_ns > pid_params.sample_rate_ns * 3
&& cpu->last_sample_time > 0) {
sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns),
int_tofp(duration_ns));
core_busy = mul_fp(core_busy, sample_ratio);
}
......@@ -1028,26 +1016,21 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
sample->mperf,
sample->aperf,
sample->tsc,
sample->freq);
get_avg_frequency(cpu));
}
static void intel_hwp_timer_func(unsigned long __data)
static void intel_pstate_update_util(struct update_util_data *data, u64 time,
unsigned long util, unsigned long max)
{
struct cpudata *cpu = (struct cpudata *) __data;
struct cpudata *cpu = container_of(data, struct cpudata, update_util);
u64 delta_ns = time - cpu->sample.time;
intel_pstate_sample(cpu);
intel_hwp_set_sample_time(cpu);
}
static void intel_pstate_timer_func(unsigned long __data)
{
struct cpudata *cpu = (struct cpudata *) __data;
intel_pstate_sample(cpu);
if ((s64)delta_ns >= pid_params.sample_rate_ns) {
bool sample_taken = intel_pstate_sample(cpu, time);
if (sample_taken && !hwp_active)
intel_pstate_adjust_busy_pstate(cpu);
intel_pstate_set_sample_time(cpu);
}
}
#define ICPU(model, policy) \
......@@ -1095,24 +1078,19 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
cpu->cpu = cpunum;
if (hwp_active)
if (hwp_active) {
intel_pstate_hwp_enable(cpu);
pid_params.sample_rate_ms = 50;
pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC;
}
intel_pstate_get_cpu_pstates(cpu);
init_timer_deferrable(&cpu->timer);
cpu->timer.data = (unsigned long)cpu;
cpu->timer.expires = jiffies + HZ/100;
if (!hwp_active)
cpu->timer.function = intel_pstate_timer_func;
else
cpu->timer.function = intel_hwp_timer_func;
intel_pstate_busy_pid_reset(cpu);
intel_pstate_sample(cpu);
intel_pstate_sample(cpu, 0);
add_timer_on(&cpu->timer, cpunum);
cpu->update_util.func = intel_pstate_update_util;
cpufreq_set_update_util_data(cpunum, &cpu->update_util);
pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
......@@ -1128,7 +1106,7 @@ static unsigned int intel_pstate_get(unsigned int cpu_num)
if (!cpu)
return 0;
sample = &cpu->sample;
return sample->freq;
return get_avg_frequency(cpu);
}
static int intel_pstate_set_policy(struct cpufreq_policy *policy)
......@@ -1141,7 +1119,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
pr_debug("intel_pstate: set performance\n");
limits = &performance_limits;
if (hwp_active)
intel_pstate_hwp_set();
intel_pstate_hwp_set(policy->cpus);
return 0;
}
......@@ -1173,7 +1151,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
int_tofp(100));
if (hwp_active)
intel_pstate_hwp_set();
intel_pstate_hwp_set(policy->cpus);
return 0;
}
......@@ -1196,7 +1174,9 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
del_timer_sync(&all_cpu_data[cpu_num]->timer);
cpufreq_set_update_util_data(cpu_num, NULL);
synchronize_sched();
if (hwp_active)
return;
......@@ -1260,6 +1240,7 @@ static int intel_pstate_msrs_not_valid(void)
static void copy_pid_params(struct pstate_adjust_policy *policy)
{
pid_params.sample_rate_ms = policy->sample_rate_ms;
pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
pid_params.p_gain_pct = policy->p_gain_pct;
pid_params.i_gain_pct = policy->i_gain_pct;
pid_params.d_gain_pct = policy->d_gain_pct;
......@@ -1397,6 +1378,11 @@ static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
#endif /* CONFIG_ACPI */
static const struct x86_cpu_id hwp_support_ids[] __initconst = {
{ X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
{}
};
static int __init intel_pstate_init(void)
{
int cpu, rc = 0;
......@@ -1406,17 +1392,16 @@ static int __init intel_pstate_init(void)
if (no_load)
return -ENODEV;
if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
copy_cpu_funcs(&core_params.funcs);
hwp_active++;
goto hwp_cpu_matched;
}
id = x86_match_cpu(intel_pstate_cpu_ids);
if (!id)
return -ENODEV;
/*
* The Intel pstate driver will be ignored if the platform
* firmware has its own power management modes.
*/
if (intel_pstate_platform_pwr_mgmt_exists())
return -ENODEV;
cpu_def = (struct cpu_defaults *)id->driver_data;
copy_pid_params(&cpu_def->pid_policy);
......@@ -1425,17 +1410,20 @@ static int __init intel_pstate_init(void)
if (intel_pstate_msrs_not_valid())
return -ENODEV;
hwp_cpu_matched:
/*
* The Intel pstate driver will be ignored if the platform
* firmware has its own power management modes.
*/
if (intel_pstate_platform_pwr_mgmt_exists())
return -ENODEV;
pr_info("Intel P-state driver initializing.\n");
all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
if (!all_cpu_data)
return -ENOMEM;
if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) {
pr_info("intel_pstate: HWP enabled\n");
hwp_active++;
}
if (!hwp_active && hwp_only)
goto out;
......@@ -1446,12 +1434,16 @@ static int __init intel_pstate_init(void)
intel_pstate_debug_expose_params();
intel_pstate_sysfs_expose_params();
if (hwp_active)
pr_info("intel_pstate: HWP enabled\n");
return rc;
out:
get_online_cpus();
for_each_online_cpu(cpu) {
if (all_cpu_data[cpu]) {
del_timer_sync(&all_cpu_data[cpu]->timer);
cpufreq_set_update_util_data(cpu, NULL);
synchronize_sched();
kfree(all_cpu_data[cpu]);
}
}
......
......@@ -28,6 +28,8 @@
#include <linux/of.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <trace/events/power.h>
#include <asm/cputhreads.h>
#include <asm/firmware.h>
......@@ -42,13 +44,24 @@
static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
static bool rebooting, throttled, occ_reset;
static unsigned int *core_to_chip_map;
static const char * const throttle_reason[] = {
"No throttling",
"Power Cap",
"Processor Over Temperature",
"Power Supply Failure",
"Over Current",
"OCC Reset"
};
static struct chip {
unsigned int id;
bool throttled;
bool restore;
u8 throttle_reason;
cpumask_t mask;
struct work_struct throttle;
bool restore;
} *chips;
static int nr_chips;
......@@ -312,13 +325,14 @@ static inline unsigned int get_nominal_index(void)
static void powernv_cpufreq_throttle_check(void *data)
{
unsigned int cpu = smp_processor_id();
unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)];
unsigned long pmsr;
int pmsr_pmax, i;
pmsr = get_pmspr(SPRN_PMSR);
for (i = 0; i < nr_chips; i++)
if (chips[i].id == cpu_to_chip_id(cpu))
if (chips[i].id == chip_id)
break;
/* Check for Pmax Capping */
......@@ -328,17 +342,17 @@ static void powernv_cpufreq_throttle_check(void *data)
goto next;
chips[i].throttled = true;
if (pmsr_pmax < powernv_pstate_info.nominal)
pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
cpu, chips[i].id, pmsr_pmax,
powernv_pstate_info.nominal);
else
pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n",
cpu, chips[i].id, pmsr_pmax,
powernv_pstate_info.max);
trace_powernv_throttle(chips[i].id,
throttle_reason[chips[i].throttle_reason],
pmsr_pmax);
} else if (chips[i].throttled) {
chips[i].throttled = false;
pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu,
chips[i].id, pmsr_pmax);
trace_powernv_throttle(chips[i].id,
throttle_reason[chips[i].throttle_reason],
pmsr_pmax);
}
/* Check if Psafe_mode_active is set in PMSR. */
......@@ -356,7 +370,7 @@ static void powernv_cpufreq_throttle_check(void *data)
if (throttled) {
pr_info("PMSR = %16lx\n", pmsr);
pr_crit("CPU Frequency could be throttled\n");
pr_warn("CPU Frequency could be throttled\n");
}
}
......@@ -423,18 +437,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work)
{
struct chip *chip = container_of(work, struct chip, throttle);
unsigned int cpu;
cpumask_var_t mask;
cpumask_t mask;
smp_call_function_any(&chip->mask,
get_online_cpus();
cpumask_and(&mask, &chip->mask, cpu_online_mask);
smp_call_function_any(&mask,
powernv_cpufreq_throttle_check, NULL, 0);
if (!chip->restore)
return;
goto out;
chip->restore = false;
cpumask_copy(mask, &chip->mask);
for_each_cpu_and(cpu, mask, cpu_online_mask) {
int index, tcpu;
for_each_cpu(cpu, &mask) {
int index;
struct cpufreq_policy policy;
cpufreq_get_policy(&policy, cpu);
......@@ -442,20 +457,12 @@ void powernv_cpufreq_work_fn(struct work_struct *work)
policy.cur,
CPUFREQ_RELATION_C, &index);
powernv_cpufreq_target_index(&policy, index);
for_each_cpu(tcpu, policy.cpus)
cpumask_clear_cpu(tcpu, mask);
cpumask_andnot(&mask, &mask, policy.cpus);
}
out:
put_online_cpus();
}
static char throttle_reason[][30] = {
"No throttling",
"Power Cap",
"Processor Over Temperature",
"Power Supply Failure",
"Over Current",
"OCC Reset"
};
static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
unsigned long msg_type, void *_msg)
{
......@@ -481,7 +488,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
*/
if (!throttled) {
throttled = true;
pr_crit("CPU frequency is throttled for duration\n");
pr_warn("CPU frequency is throttled for duration\n");
}
break;
......@@ -505,24 +512,19 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
return 0;
}
if (omsg.throttle_status &&
for (i = 0; i < nr_chips; i++)
if (chips[i].id == omsg.chip)
break;
if (omsg.throttle_status >= 0 &&
omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS)
pr_info("OCC: Chip %u Pmax reduced due to %s\n",
(unsigned int)omsg.chip,
throttle_reason[omsg.throttle_status]);
else if (!omsg.throttle_status)
pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip,
throttle_reason[omsg.throttle_status]);
else
return 0;
chips[i].throttle_reason = omsg.throttle_status;
for (i = 0; i < nr_chips; i++)
if (chips[i].id == omsg.chip) {
if (!omsg.throttle_status)
chips[i].restore = true;
schedule_work(&chips[i].throttle);
}
}
return 0;
}
......@@ -556,29 +558,54 @@ static int init_chip_info(void)
unsigned int chip[256];
unsigned int cpu, i;
unsigned int prev_chip_id = UINT_MAX;
cpumask_t cpu_mask;
int ret = -ENOMEM;
core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int),
GFP_KERNEL);
if (!core_to_chip_map)
goto out;
for_each_possible_cpu(cpu) {
cpumask_copy(&cpu_mask, cpu_possible_mask);
for_each_cpu(cpu, &cpu_mask) {
unsigned int id = cpu_to_chip_id(cpu);
if (prev_chip_id != id) {
prev_chip_id = id;
chip[nr_chips++] = id;
}
core_to_chip_map[cpu_core_index_of_thread(cpu)] = id;
cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu));
}
chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL);
chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
if (!chips)
return -ENOMEM;
goto free_chip_map;
for (i = 0; i < nr_chips; i++) {
chips[i].id = chip[i];
chips[i].throttled = false;
cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
chips[i].restore = false;
}
return 0;
free_chip_map:
kfree(core_to_chip_map);
out:
return ret;
}
static inline void clean_chip_info(void)
{
kfree(chips);
kfree(core_to_chip_map);
}
static inline void unregister_all_notifiers(void)
{
opal_message_notifier_unregister(OPAL_MSG_OCC,
&powernv_cpufreq_opal_nb);
unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}
static int __init powernv_cpufreq_init(void)
......@@ -591,28 +618,35 @@ static int __init powernv_cpufreq_init(void)
/* Discover pstates from device tree and init */
rc = init_powernv_pstates();
if (rc) {
pr_info("powernv-cpufreq disabled. System does not support PState control\n");
return rc;
}
if (rc)
goto out;
/* Populate chip info */
rc = init_chip_info();
if (rc)
return rc;
goto out;
register_reboot_notifier(&powernv_cpufreq_reboot_nb);
opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);
return cpufreq_register_driver(&powernv_cpufreq_driver);
rc = cpufreq_register_driver(&powernv_cpufreq_driver);
if (!rc)
return 0;
pr_info("Failed to register the cpufreq driver (%d)\n", rc);
unregister_all_notifiers();
clean_chip_info();
out:
pr_info("Platform driver disabled. System does not support PState control\n");
return rc;
}
module_init(powernv_cpufreq_init);
static void __exit powernv_cpufreq_exit(void)
{
unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
opal_message_notifier_unregister(OPAL_MSG_OCC,
&powernv_cpufreq_opal_nb);
cpufreq_unregister_driver(&powernv_cpufreq_driver);
unregister_all_notifiers();
clean_chip_info();
}
module_exit(powernv_cpufreq_exit);
......
......@@ -80,7 +80,6 @@ struct cpufreq_policy {
unsigned int last_policy; /* policy before unplug */
struct cpufreq_governor *governor; /* see below */
void *governor_data;
bool governor_enabled; /* governor start/stop flag */
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
struct work_struct update; /* if update_policy() needs to be
......@@ -100,10 +99,6 @@ struct cpufreq_policy {
* - Any routine that will write to the policy structure and/or may take away
* the policy altogether (eg. CPU hotplug), will hold this lock in write
* mode before doing so.
*
* Additional rules:
* - Lock should not be held across
* __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
*/
struct rw_semaphore rwsem;
......@@ -464,29 +459,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
int cpufreq_register_governor(struct cpufreq_governor *governor);
void cpufreq_unregister_governor(struct cpufreq_governor *governor);
/* CPUFREQ DEFAULT GOVERNOR */
/*
* Performance governor is fallback governor if any other gov failed to auto
* load due latency restrictions
*/
#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
extern struct cpufreq_governor cpufreq_gov_performance;
#endif
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_performance)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE)
extern struct cpufreq_governor cpufreq_gov_powersave;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_powersave)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE)
extern struct cpufreq_governor cpufreq_gov_userspace;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_userspace)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND)
extern struct cpufreq_governor cpufreq_gov_ondemand;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_ondemand)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
extern struct cpufreq_governor cpufreq_gov_conservative;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
#endif
struct cpufreq_governor *cpufreq_default_governor(void);
struct cpufreq_governor *cpufreq_fallback_governor(void);
/*********************************************************************
* FREQUENCY TABLE HELPERS *
......@@ -525,16 +499,6 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
}
#endif
static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
{
while ((*pos)->frequency != CPUFREQ_TABLE_END)
if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID)
return true;
else
(*pos)++;
return false;
}
/*
* cpufreq_for_each_entry - iterate over a cpufreq_frequency_table
* @pos: the cpufreq_frequency_table * to use as a loop cursor.
......@@ -552,7 +516,10 @@ static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
*/
#define cpufreq_for_each_valid_entry(pos, table) \
for (pos = table; cpufreq_next_valid(&pos); pos++)
for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++) \
if (pos->frequency == CPUFREQ_ENTRY_INVALID) \
continue; \
else
int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table);
......
......@@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit)
return task_rlimit_max(current, limit);
}
#ifdef CONFIG_CPU_FREQ
struct update_util_data {
void (*func)(struct update_util_data *data,
u64 time, unsigned long util, unsigned long max);
};
void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
#endif /* CONFIG_CPU_FREQ */
#endif
......@@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle,
TP_ARGS(state, cpu_id)
);
TRACE_EVENT(powernv_throttle,
TP_PROTO(int chip_id, const char *reason, int pmax),
TP_ARGS(chip_id, reason, pmax),
TP_STRUCT__entry(
__field(int, chip_id)
__string(reason, reason)
__field(int, pmax)
),
TP_fast_assign(
__entry->chip_id = chip_id;
__assign_str(reason, reason);
__entry->pmax = pmax;
),
TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
__entry->pmax, __get_str(reason))
);
TRACE_EVENT(pstate_sample,
TP_PROTO(u32 core_busy,
......
......@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
/*
* Scheduler code and data structures related to cpufreq.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
* cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
* @cpu: The CPU to set the pointer for.
* @data: New pointer value.
*
* Set and publish the update_util_data pointer for the given CPU. That pointer
* points to a struct update_util_data object containing a callback function
* to call from cpufreq_update_util(). That function will be called from an RCU
* read-side critical section, so it must not sleep.
*
* Callers must use RCU-sched callbacks to free any memory that might be
* accessed via the old update_util_data pointer or invoke synchronize_sched()
* right after this function to avoid use-after-free.
*/
void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
{
if (WARN_ON(data && !data->func))
return;
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
}
EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
......@@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq)
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
/* Kick cpufreq (see the comment in linux/cpufreq.h). */
if (cpu_of(rq) == smp_processor_id())
cpufreq_trigger_update(rq_clock(rq));
/*
* Consumed budget is computed considering the time as
* observed by schedulable tasks (excluding time spent
......
......@@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int cpu = cpu_of(rq_of(cfs_rq));
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
/*
* Track task load average for carrying it to new CPU after migrated, and
......@@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
unsigned long max = rq->cpu_capacity_orig;
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
* a real problem -- added to that it only calls on the local
* CPU, so if we enqueue remotely we'll miss an update, but
* the next tick/schedule should update.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
* number include things like RT tasks.
*
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util().
*/
cpufreq_update_util(rq_clock(rq),
min(cfs_rq->avg.util_avg, max), max);
}
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
......
......@@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq)
if (curr->sched_class != &rt_sched_class)
return;
/* Kick cpufreq (see the comment in linux/cpufreq.h). */
if (cpu_of(rq) == smp_processor_id())
cpufreq_trigger_update(rq_clock(rq));
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
......
......@@ -1738,3 +1738,51 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
* cpufreq_update_util - Take a note about CPU utilization changes.
* @time: Current time.
* @util: Current utilization.
* @max: Utilization ceiling.
*
* This function is called by the scheduler on every invocation of
* update_load_avg() on the CPU whose utilization is being updated.
*
* It can only be called from RCU-sched read-side critical sections.
*/
static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
{
struct update_util_data *data;
data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
if (data)
data->func(data, time, util, max);
}
/**
* cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
* @time: Current time.
*
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
* being stuck in a completely inadequate performance level for too long.
* That is not guaranteed to happen if the updates are only triggered from CFS,
* though, because they may not be coming in if RT or deadline tasks are active
* all the time (or there are RT and DL tasks only).
*
* As a workaround for that issue, this function is called by the RT and DL
* sched classes to trigger extra cpufreq updates to prevent it from stalling,
* but that really is a band-aid. Going forward it should be replaced with
* solutions targeted more specifically at RT and DL tasks.
*/
static inline void cpufreq_trigger_update(u64 time)
{
cpufreq_update_util(time, ULONG_MAX, 0);
}
#else
static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
static inline void cpufreq_trigger_update(u64 time) {}
#endif /* CONFIG_CPU_FREQ */
......@@ -15,4 +15,5 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment