Commit 44dba3d5 authored by Iulia Manda, committed by Ingo Molnar

sched: Refactor task_struct to use numa_faults instead of numa_* pointers

This patch simplifies task_struct by removing the four numa_* pointers
that all pointed into the same array and replacing them with a single
pointer to that array. By doing this, the size of task_struct is reduced
by 3 ulong pointers (24 bytes on x86_64).

A new parameter is added to the task_faults_idx function so that it can
return an index at the correct offset within the array, corresponding to
what the old precalculated pointers provided.

All of the code in sched/ that depended on task_faults_idx and the numa_*
pointers was changed to match the new logic.
Signed-off-by: Iulia Manda <iulia.manda21@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: mgorman@suse.de
Cc: dave@stgolabs.net
Cc: riel@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20141031001331.GA30662@winterfell
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent cad3bb32
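
Conceptually, the refactor trades four pointers into a single kzalloc'd buffer for one pointer plus computed offsets. The following standalone C sketch illustrates only the size argument from the changelog; the two structs are illustrative stand-ins, not the kernel's task_struct:

#include <stdio.h>

/* Stand-ins for the task_struct members involved; not the kernel types. */
struct before {
        unsigned long *numa_faults_memory;
        unsigned long *numa_faults_buffer_memory;
        unsigned long *numa_faults_cpu;
        unsigned long *numa_faults_buffer_cpu;
};

struct after {
        unsigned long *numa_faults;     /* one pointer, four regions */
};

int main(void)
{
        /* On x86_64: 32 vs 8 bytes, i.e. the 3 pointers (24 bytes) saved. */
        printf("before: %zu bytes, after: %zu bytes\n",
               sizeof(struct before), sizeof(struct after));
        return 0;
}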
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1597,27 +1597,22 @@ struct task_struct {
         struct numa_group *numa_group;
 
         /*
-         * Exponential decaying average of faults on a per-node basis.
-         * Scheduling placement decisions are made based on the these counts.
-         * The values remain static for the duration of a PTE scan
+         * numa_faults is an array split into four regions:
+         * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
+         * in this precise order.
+         *
+         * faults_memory: Exponential decaying average of faults on a per-node
+         * basis. Scheduling placement decisions are made based on these
+         * counts. The values remain static for the duration of a PTE scan.
+         * faults_cpu: Track the nodes the process was running on when a NUMA
+         * hinting fault was incurred.
+         * faults_memory_buffer and faults_cpu_buffer: Record faults per node
+         * during the current scan window. When the scan completes, the counts
+         * in faults_memory and faults_cpu decay and these values are copied.
          */
-        unsigned long *numa_faults_memory;
+        unsigned long *numa_faults;
         unsigned long total_numa_faults;
 
         /*
-         * numa_faults_buffer records faults per node during the current
-         * scan window. When the scan completes, the counts in
-         * numa_faults_memory decay and these values are copied.
-         */
-        unsigned long *numa_faults_buffer_memory;
-
-        /*
-         * Track the nodes the process was running on when a NUMA hinting
-         * fault was incurred.
-         */
-        unsigned long *numa_faults_cpu;
-        unsigned long *numa_faults_buffer_cpu;
-
-        /*
          * numa_faults_locality tracks if faults recorded during the last
          * scan window were remote/local. The task scan period is adapted

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1857,8 +1857,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
         p->numa_work.next = &p->numa_work;
-        p->numa_faults_memory = NULL;
-        p->numa_faults_buffer_memory = NULL;
+        p->numa_faults = NULL;
         p->last_task_numa_placement = 0;
         p->last_sum_exec_runtime = 0;

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -535,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
                 unsigned long nr_faults = -1;
                 int cpu_current, home_node;
 
-                if (p->numa_faults_memory)
-                        nr_faults = p->numa_faults_memory[2*node + i];
+                if (p->numa_faults)
+                        nr_faults = p->numa_faults[2*node + i];
 
                 cpu_current = !i ? (task_node(p) == node) :
                               (pol && node_isset(node, pol->v.nodes));

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -896,18 +896,24 @@ pid_t task_numa_group_id(struct task_struct *p)
         return p->numa_group ? p->numa_group->gid : 0;
 }
 
-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
 {
-        return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
 }
 
 static inline unsigned long task_faults(struct task_struct *p, int nid)
 {
-        if (!p->numa_faults_memory)
+        if (!p->numa_faults)
                 return 0;
 
-        return p->numa_faults_memory[task_faults_idx(nid, 0)] +
-                p->numa_faults_memory[task_faults_idx(nid, 1)];
+        return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+                p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
 static inline unsigned long group_faults(struct task_struct *p, int nid)

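To see that the new formula reproduces the offsets the old precalculated pointers encoded (base, base + 2*nr_node_ids, base + 4*nr_node_ids, base + 6*nr_node_ids), here is a standalone sketch; NR_NUMA_HINT_FAULT_TYPES and nr_node_ids mirror the kernel names but are redefined locally, and the 4-node count is an arbitrary example:

#include <assert.h>
#include <stdio.h>

#define NR_NUMA_HINT_FAULT_TYPES 2      /* private and shared faults */
static int nr_node_ids = 4;             /* example: a 4-node machine */

enum numa_faults_stats { NUMA_MEM = 0, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

/* New scheme: one formula selects the region, the node and the fault type. */
static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
        for (int nid = 0; nid < nr_node_ids; nid++) {
                for (int priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
                        int old = NR_NUMA_HINT_FAULT_TYPES * nid + priv;

                        /* Old pointers: base, base+2n, base+4n, base+6n. */
                        assert(task_faults_idx(NUMA_MEM, nid, priv) == old);
                        assert(task_faults_idx(NUMA_CPU, nid, priv) ==
                               2 * nr_node_ids + old);
                        assert(task_faults_idx(NUMA_MEMBUF, nid, priv) ==
                               4 * nr_node_ids + old);
                        assert(task_faults_idx(NUMA_CPUBUF, nid, priv) ==
                               6 * nr_node_ids + old);
                }
        }
        puts("new indexing matches the old precalculated offsets");
        return 0;
}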
@@ -915,14 +921,14 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
         if (!p->numa_group)
                 return 0;
 
-        return p->numa_group->faults[task_faults_idx(nid, 0)] +
-                p->numa_group->faults[task_faults_idx(nid, 1)];
+        return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+                p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 {
-        return group->faults_cpu[task_faults_idx(nid, 0)] +
-                group->faults_cpu[task_faults_idx(nid, 1)];
+        return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
 /* Handle placement on systems where not all nodes are directly connected. */

@@ -1001,7 +1007,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
 {
         unsigned long faults, total_faults;
 
-        if (!p->numa_faults_memory)
+        if (!p->numa_faults)
                 return 0;
 
         total_faults = p->total_numa_faults;

@@ -1517,7 +1523,7 @@ static void numa_migrate_preferred(struct task_struct *p)
         unsigned long interval = HZ;
 
         /* This task has no NUMA fault statistics yet */
-        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
                 return;
 
         /* Periodically retry migrating the task to the preferred node */

@@ -1779,18 +1785,23 @@ static void task_numa_placement(struct task_struct *p)
 
         /* Find the node with the highest number of faults */
         for_each_online_node(nid) {
+                /* Keep track of the offsets in numa_faults array */
+                int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
                 unsigned long faults = 0, group_faults = 0;
-                int priv, i;
+                int priv;
 
                 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
                         long diff, f_diff, f_weight;
 
-                        i = task_faults_idx(nid, priv);
+                        mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+                        membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+                        cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+                        cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
 
                         /* Decay existing window, copy faults since last scan */
-                        diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
-                        fault_types[priv] += p->numa_faults_buffer_memory[i];
-                        p->numa_faults_buffer_memory[i] = 0;
+                        diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+                        fault_types[priv] += p->numa_faults[membuf_idx];
+                        p->numa_faults[membuf_idx] = 0;
 
                         /*
                          * Normalize the faults_from, so all tasks in a group
@@ -1800,21 +1811,27 @@ static void task_numa_placement(struct task_struct *p)
                          * faults are less important.
                          */
                         f_weight = div64_u64(runtime << 16, period + 1);
-                        f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+                        f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
                                    (total_faults + 1);
-                        f_diff = f_weight - p->numa_faults_cpu[i] / 2;
-                        p->numa_faults_buffer_cpu[i] = 0;
+                        f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+                        p->numa_faults[cpubuf_idx] = 0;
 
-                        p->numa_faults_memory[i] += diff;
-                        p->numa_faults_cpu[i] += f_diff;
-                        faults += p->numa_faults_memory[i];
+                        p->numa_faults[mem_idx] += diff;
+                        p->numa_faults[cpu_idx] += f_diff;
+                        faults += p->numa_faults[mem_idx];
                         p->total_numa_faults += diff;
                         if (p->numa_group) {
-                                /* safe because we can only change our own group */
-                                p->numa_group->faults[i] += diff;
-                                p->numa_group->faults_cpu[i] += f_diff;
+                                /*
+                                 * safe because we can only change our own group
+                                 *
+                                 * mem_idx represents the offset for a given
+                                 * nid and priv in a specific region because it
+                                 * is at the beginning of the numa_faults array.
+                                 */
+                                p->numa_group->faults[mem_idx] += diff;
+                                p->numa_group->faults_cpu[mem_idx] += f_diff;
                                 p->numa_group->total_faults += diff;
-                                group_faults += p->numa_group->faults[i];
+                                group_faults += p->numa_group->faults[mem_idx];
                         }
                 }

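The decay-and-copy step above is an exponential running average: each completed scan window halves the accumulated count and adds the faults buffered since the last scan, so recent windows dominate. A standalone sketch of that arithmetic, with made-up per-window fault counts:

#include <stdio.h>

int main(void)
{
        /* Hypothetical faults recorded on one node in three scan windows. */
        unsigned long windows[] = { 100, 40, 0 };
        unsigned long mem = 0;  /* the averaged counter, numa_faults[mem_idx] */

        for (int i = 0; i < 3; i++) {
                unsigned long membuf = windows[i];      /* the buffer counter */

                /* Same update as task_numa_placement: decay, then copy. */
                long diff = (long)membuf - (long)(mem / 2);
                mem += diff;    /* equivalent to mem = mem / 2 + membuf */
                printf("window %d: buffered=%lu averaged=%lu\n",
                       i, membuf, mem);
        }
        return 0;       /* prints averaged counts 100, 90, 45 */
}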
@@ -1886,7 +1903,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                 node_set(task_node(current), grp->active_nodes);
 
                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
-                        grp->faults[i] = p->numa_faults_memory[i];
+                        grp->faults[i] = p->numa_faults[i];
 
                 grp->total_faults = p->total_numa_faults;

@@ -1945,8 +1962,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
         double_lock_irq(&my_grp->lock, &grp->lock);
 
         for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
-                my_grp->faults[i] -= p->numa_faults_memory[i];
-                grp->faults[i] += p->numa_faults_memory[i];
+                my_grp->faults[i] -= p->numa_faults[i];
+                grp->faults[i] += p->numa_faults[i];
         }
         my_grp->total_faults -= p->total_numa_faults;
         grp->total_faults += p->total_numa_faults;

@@ -1971,14 +1988,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 void task_numa_free(struct task_struct *p)
 {
         struct numa_group *grp = p->numa_group;
-        void *numa_faults = p->numa_faults_memory;
+        void *numa_faults = p->numa_faults;
         unsigned long flags;
         int i;
 
         if (grp) {
                 spin_lock_irqsave(&grp->lock, flags);
                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
-                        grp->faults[i] -= p->numa_faults_memory[i];
+                        grp->faults[i] -= p->numa_faults[i];
                 grp->total_faults -= p->total_numa_faults;
 
                 list_del(&p->numa_entry);
@@ -1988,10 +2005,7 @@ void task_numa_free(struct task_struct *p)
                 put_numa_group(grp);
         }
 
-        p->numa_faults_memory = NULL;
-        p->numa_faults_buffer_memory = NULL;
-        p->numa_faults_cpu= NULL;
-        p->numa_faults_buffer_cpu = NULL;
+        p->numa_faults = NULL;
         kfree(numa_faults);
 }

@@ -2014,24 +2028,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                 return;
 
         /* Allocate buffer to track faults on a per-node basis */
-        if (unlikely(!p->numa_faults_memory)) {
-                int size = sizeof(*p->numa_faults_memory) *
+        if (unlikely(!p->numa_faults)) {
+                int size = sizeof(*p->numa_faults) *
                            NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
 
-                p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
-                if (!p->numa_faults_memory)
+                p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+                if (!p->numa_faults)
                         return;
 
-                BUG_ON(p->numa_faults_buffer_memory);
-                /*
-                 * The averaged statistics, shared & private, memory & cpu,
-                 * occupy the first half of the array. The second half of the
-                 * array is for current counters, which are averaged into the
-                 * first set by task_numa_placement.
-                 */
-                p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
-                p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
-                p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
-
                 p->total_numa_faults = 0;
                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
         }

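Note that the allocation size is unchanged by the refactor; the single buffer already covered all four regions. A standalone sketch of the size arithmetic (the three constants mirror fair.c's definitions; the node count is an example):

#include <stdio.h>

#define NR_NUMA_HINT_FAULT_TYPES   2    /* private, shared */
#define NR_NUMA_HINT_FAULT_STATS   (NR_NUMA_HINT_FAULT_TYPES * 2)   /* memory + cpu */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)   /* averaged + buffers */

int main(void)
{
        int nr_node_ids = 4;    /* example: a 4-node machine */
        size_t size = sizeof(unsigned long) *
                      NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

        /* 8 counters per node: {mem,cpu,membuf,cpubuf} x {priv,shared} */
        printf("%d counters, %zu bytes\n",
               NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids, size);
        return 0;
}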
@@ -2071,8 +2075,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         if (migrated)
                 p->numa_pages_migrated += pages;
 
-        p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
-        p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+        p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+        p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
         p->numa_faults_locality[local] += pages;
 }

@@ -5361,7 +5365,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
         struct numa_group *numa_group = rcu_dereference(p->numa_group);
         int src_nid, dst_nid;
 
-        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
             !(env->sd->flags & SD_NUMA)) {
                 return false;
         }

@@ -5400,7 +5404,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
                 return false;
 
-        if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+        if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
                 return false;
 
         src_nid = cpu_to_node(env->src_cpu);

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -709,6 +709,13 @@ extern bool find_numa_distance(int distance);
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+        NUMA_MEM = 0,
+        NUMA_CPU,
+        NUMA_MEMBUF,
+        NUMA_CPUBUF
+};
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);