Commit 66558b73 authored by Tim Chen, committed by Peter Zijlstra

sched: Add cluster scheduler level for x86

There are x86 CPU architectures (e.g. Jacobsville) where the L2 cache
is shared among a cluster of cores instead of being exclusive to a
single core.

To prevent oversubscription of the L2 cache, load should be balanced
between such L2 clusters, especially for tasks with no shared data.
On benchmarks such as the SPECrate mcf test, this change boosts
performance, especially on a medium-load system. On a Jacobsville
with 24 Atom cores, arranged into 6 clusters of 4 cores each, the
benchmark numbers are as follows:

 Improvement over baseline kernel for mcf_r
 copies		run time	base rate
 1		-0.1%		-0.2%
 6		25.1%		25.1%
 12		18.8%		19.0%
 24		0.3%		0.3%

So this looks pretty good. In terms of the system's task distribution,
some pretty bad clumping can be seen for the vanilla kernel without
the L2 cluster domain in the 6- and 12-copy cases. With the extra
cluster domain, the load does get evened out between the clusters.

Note this patch isn't a universal win, as spreading isn't necessarily
a win, particularly for workloads that can benefit from packing.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210924085104.44806-4-21cnbao@gmail.com
parent 778c558f
@@ -1001,6 +1001,17 @@ config NR_CPUS
 	  This is purely to save memory: each supported CPU adds about 8KB
 	  to the kernel image.
 
+config SCHED_CLUSTER
+	bool "Cluster scheduler support"
+	depends on SMP
+	default y
+	help
+	  Cluster scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have clusters of CPUs.
+	  Cluster usually means a couple of CPUs which are placed closely
+	  by sharing mid-level caches, last-level cache tags or internal
+	  busses.
+
 config SCHED_SMT
 	def_bool y if SMP
...
@@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 /* cpus sharing the last level cache: */
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
 static inline struct cpumask *cpu_llc_shared_mask(int cpu)
@@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 	return per_cpu(cpu_llc_shared_map, cpu);
 }
 
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+	return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
...
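For orientation, here is a hedged, illustrative sketch (not part of this
patch) of how kernel code could consume the new cpu_l2c_shared_mask()
accessor declared above. The helper name l2_sibling_count() is made up
purely for illustration; the mask itself is filled in later by
set_cpu_sibling_map() in smpboot.c.

/* Hedged illustration only; l2_sibling_count() is not part of this patch. */
#include <linux/cpumask.h>
#include <asm/smp.h>

static unsigned int l2_sibling_count(int cpu)
{
	/* Number of CPUs (including @cpu) that share an L2 with @cpu. */
	return cpumask_weight(cpu_l2c_shared_mask(cpu));
}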
@@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { }
 #include <asm-generic/topology.h>
 
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);
 
 #define topology_logical_package_id(cpu)	(cpu_data(cpu).logical_proc_id)
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
@@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
 extern unsigned int __max_die_per_package;
 
 #ifdef CONFIG_SMP
+#define topology_cluster_id(cpu)		(per_cpu(cpu_l2c_id, cpu))
 #define topology_die_cpumask(cpu)		(per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu)		(cpu_clustergroup_mask(cpu))
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
...
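As a hedged illustration (not from this patch), generic code can consume
the two new topology macros above roughly as follows. The function name
report_cluster() is hypothetical and only shows the calling convention:
topology_cluster_id() returns the per-CPU L2 id, and
topology_cluster_cpumask() returns the mask of CPUs in the same cluster.

/* Hedged illustration only; report_cluster() is not part of this patch. */
#include <linux/printk.h>
#include <linux/topology.h>

static void report_cluster(int cpu)
{
	/* Print the L2-derived cluster id and the CPUs sharing that cluster. */
	pr_info("CPU%d: cluster id %d, cluster siblings %*pbl\n",
		cpu, topology_cluster_id(cpu),
		cpumask_pr_args(topology_cluster_cpumask(cpu)));
}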
@@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 		l2 = new_l2;
 #ifdef CONFIG_SMP
 		per_cpu(cpu_llc_id, cpu) = l2_id;
+		per_cpu(cpu_l2c_id, cpu) = l2_id;
 #endif
 	}
...
@@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu)
 }
 EXPORT_SYMBOL_GPL(get_llc_id);
 
+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+
 /* correctly size the local cpu masks */
 void __init setup_cpu_local_masks(void)
 {
...
@@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	/* Do not match if we do not have a valid APICID for cpu: */
+	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+		return false;
+
+	/* Do not match if L2 cache id does not match: */
+	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+		return false;
+
+	return topology_sane(c, o, "l2c");
+}
+
 /*
  * Unlike the other levels, we do not enforce keeping a
  * multicore group inside a NUMA node. If this happens, we will
@@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 }
 
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
 {
 	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -541,12 +558,21 @@ static int x86_smt_flags(void)
 	return cpu_smt_flags() | x86_sched_itmt_flags();
 }
 #endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+	return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
 #endif
 
 static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
+#ifdef CONFIG_SCHED_CLUSTER
+	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
@@ -557,6 +583,9 @@ static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
+#ifdef CONFIG_SCHED_CLUSTER
+	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
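The two tables above are x86's sched-domain descriptions; the new CLS
level, built from cpu_clustergroup_mask() and x86_cluster_flags(), now
sits between SMT and MC in both. As a hedged sketch of how such a table
takes effect (not part of this diff), an architecture hands it to the
scheduler core via set_sched_topology(); x86 does the equivalent during
SMP bring-up in this same file.

/*
 * Hedged sketch only; example_install_topology() is not part of this
 * patch. set_sched_topology() is the generic hook for replacing the
 * scheduler's default domain table with an architecture-specific one.
 */
#include <linux/init.h>
#include <linux/sched/topology.h>

static void __init example_install_topology(void)
{
	/* x86_topology is the static table defined just above. */
	set_sched_topology(x86_topology);
}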
@@ -584,6 +613,7 @@ void set_cpu_sibling_map(int cpu)
 	if (!has_mp) {
 		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
 		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
 		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
 		c->booted_cores = 1;
@@ -602,6 +632,9 @@ void set_cpu_sibling_map(int cpu)
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
 			link_mask(cpu_llc_shared_mask, cpu, i);
 
+		if ((i == cpu) || (has_mp && match_l2c(c, o)))
+			link_mask(cpu_l2c_shared_mask, cpu, i);
+
 		if ((i == cpu) || (has_mp && match_die(c, o)))
 			link_mask(topology_die_cpumask, cpu, i);
 	}
@@ -652,6 +685,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	return cpu_llc_shared_mask(cpu);
 }
 
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+	return cpu_l2c_shared_mask(cpu);
+}
+
 static void impress_friends(void)
 {
 	int cpu;
@@ -1335,6 +1373,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
 	}
 
 	/*
@@ -1564,7 +1603,10 @@ static void remove_siblinginfo(int cpu)
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
 		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+		cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
 	cpumask_clear(cpu_llc_shared_mask(cpu));
+	cpumask_clear(cpu_l2c_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
 	cpumask_clear(topology_die_cpumask(cpu));
...