sched: zap the migration init / cache-hot balancing code

the SMP load-balancer uses the boot-time migration-cost estimation code to attempt to improve the quality of balancing. The reason for this code is that the discrete priority queues do not preserve the order of scheduling accurately, so the load-balancer skips tasks that were running on a CPU 'recently'. this code is fundamental fragile: the boot-time migration cost detector doesnt really work on systems that had large L3 caches, it caused boot delays on large systems and the whole cache-hot concept made the balancing code pretty undeterministic as well. (and hey, i wrote most of it, so i can say it out loud that it sucks ;-) under CFS the same purpose of cache affinity can be achieved without any special cache-hot special-case: tasks are sorted in the 'timeline' tree and the SMP balancer picks tasks from the left side of the tree, thus the most cache-cold task is balanced automatically. Signed-off-by: Ingo Molnar <mingo@elte.hu>

sched: zap the migration init / cache-hot balancing code
the SMP load-balancer uses the boot-time migration-cost estimation code to attempt to improve the quality of balancing. The reason for this code is that the discrete priority queues do not preserve the order of scheduling accurately, so the load-balancer skips tasks that were running on a CPU 'recently'. this code is fundamental fragile: the boot-time migration cost detector doesnt really work on systems that had large L3 caches, it caused boot delays on large systems and the whole cache-hot concept made the balancing code pretty undeterministic as well. (and hey, i wrote most of it, so i can say it out loud that it sucks ;-) under CFS the same purpose of cache affinity can be achieved without any special cache-hot special-case: tasks are sorted in the 'timeline' tree and the SMP balancer picks tasks from the left side of the tree, thus the most cache-cold task is balanced automatically. Signed-off-by: Ingo Molnar <mingo@elte.hu>
0437e109 · Ingo Molnar · 0e6aca43 · 0437e109 · 0437e109 · 0437e109
Commit 0437e109 authored Jul 09, 2007 by Ingo Molnar
8 changed files
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file

 	mga=		[HW,DRM]

-	migration_cost=
-			[KNL,SMP] debug: override scheduler migration costs
-			Format: <level-1-usecs>,<level-2-usecs>,...
-			This debugging option can be used to override the
-			default scheduler migration cost matrix. The numbers
-			are indexed by 'CPU domain distance'.
-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
-			box will set up an intra-core migration cost of
-			1 msec, an inter-core migration cost of 2 msecs,
-			and an inter-node migration cost of 3 msecs.
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
-	migration_debug=
-			[KNL,SMP] migration cost auto-detect verbosity
-			Format=<0|1|2>
-			If a system's migration matrix reported at bootup
-			seems erroneous then this option can be used to
-			increase verbosity of the detection process.
-			We default to 0 (no extra messages), 1 will print
-			some more information, and 2 will be really
-			verbose (probably only useful if you also have a
-			serial console attached to the system).
-
-	migration_factor=
-			[KNL,SMP] multiply/divide migration costs by a factor
-			Format=<percent>
-			This debug option can be used to proportionally
-			increase or decrease the auto-detected migration
-			costs for all entries of the migration matrix.
-			E.g. migration_factor=150 will increase migration
-			costs by 50%. (and thus the scheduler will be less
-			eager migrating cache-hot tasks)
-			migration_factor=80 will decrease migration costs
-			by 20%. (thus the scheduler will be more eager to
-			migrate tasks)
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered

--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 }
 #endif

-static void smp_tune_scheduling(void)
-{
-	if (cpu_khz) {
-		/* cache size in kB */
-		long cachesize = boot_cpu_data.x86_cache_size;
-
-		if (cachesize > 0)
-			max_cache_size = cachesize * 1024;
-	}
-}
-
 /*
 * Cycle through the processors sending APIC IPIs to boot each.
 */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;

 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();

 	set_cpu_sibling_map(0);


--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
 get_max_cacheline_size (void)
 {
 	unsigned long line_size, max = 1;
-	unsigned int cache_size = 0;
 	u64 l, levels, unique_caches;
        pal_cache_config_info_t cci;
        s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
 		line_size = 1 << cci.pcci_line_size;
 		if (line_size > max)
 			max = line_size;
-		if (cache_size < cci.pcci_cache_size)
-			cache_size = cci.pcci_cache_size;
 		if (!cci.pcci_unified) {
 			status = ia64_pal_cache_config_info(l,
 						    /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
 			ia64_i_cache_stride_shift = cci.pcci_stride;
 	}
  out:
-#ifdef CONFIG_SMP
-	max_cache_size = max(max_cache_size, cache_size);
-#endif
 	if (max > ia64_max_cacheline_size)
 		ia64_max_cacheline_size = max;
 }

--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS];		/* Map logical to physical */
 EXPORT_SYMBOL(phys_cpu_present_map);
 EXPORT_SYMBOL(cpu_online_map);

-/* This happens early in bootup, can't really do it better */
-static void smp_tune_scheduling (void)
-{
-	struct cache_desc *cd = &current_cpu_data.scache;
-	unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
-
-	if (cachesize > max_cache_size)
-		max_cache_size = cachesize;
-}
-
 extern void __init calibrate_delay(void);
 extern ATTRIB_NORET void cpu_idle(void);

@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	init_new_context(current, &init_mm);
 	current_thread_info()->cpu = 0;
-	smp_tune_scheduling();
 	plat_prepare_cpus(max_cpus);
 #ifndef CONFIG_HOTPLUG_CPU
 	cpu_present_map = cpu_possible_map;

--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	cpu_data(id).prom_node = cpu_node;
 	cpu_data(id).mid = cpu_get_hwmid(cpu_node);

-	/* this is required to tune the scheduler correctly */
-	/* is it possible to have CPUs with different cache sizes? */
-	if (id == boot_cpu_id) {
-		int cache_line,cache_nlines;
-		cache_line = 0x20;
-		cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
-		cache_nlines = 0x8000;
-		cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
-		max_cache_size = cache_line * cache_nlines;
-	}
 	if (cpu_data(id).mid < 0)
 		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
 }

--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }

-static void __init smp_tune_scheduling(void)
-{
-	unsigned int smallest = ~0U;
-	int i;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		unsigned int val = cpu_data(i).ecache_size;
-
-		if (val && val < smallest)
-			smallest = val;
-	}
-
-	/* Any value less than 256K is nonsense.  */
-	if (smallest < (256U * 1024U))
-		smallest = 256 * 1024;
-
-	max_cache_size = smallest;
-
-	if (smallest < 1U * 1024U * 1024U)
-		printk(KERN_INFO "Using max_cache_size of %uKB\n",
-		       smallest / 1024U);
-	else
-		printk(KERN_INFO "Using max_cache_size of %uMB\n",
-		       smallest / 1024U / 1024U);
-}
-
 /* Constrain the number of cpus to max_cpus.  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	}

 	cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
-	smp_tune_scheduling();
 }

 void __devinit smp_prepare_boot_cpu(void)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -754,12 +754,6 @@ struct sched_domain {
 extern int partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);

-/*
- * Maximum cache size the migration-costs auto-tuning code will
- * search from:
- */
-extern unsigned int max_cache_size;
-
 #endif	/* CONFIG_SMP */



--- a/kernel/sched.c
+++ b/kernel/sched.c