Commit 6fe6b2d6, authored by Rik van Riel, committed by Ingo Molnar

sched/numa: Do not migrate memory immediately after switching node

The load balancer can move tasks between nodes and does not take NUMA
locality into account. With automatic NUMA balancing this may result in the
task's working set being migrated to the new node. However, as the fault
buffer will still store faults from the old node, the scheduler may decide to
reset the preferred node and migrate the task back, resulting in more
migrations.

The ideal would be that the scheduler did not migrate tasks with a heavy
memory footprint, but this may result in nodes being overloaded. We could
also discard the fault information on task migration, but this would still
cause all of the task's working set to be migrated. This patch simply avoids
migrating the memory for a short time after a task is migrated.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-31-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent b795854b
...@@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p) ...@@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p)
p->node_stamp = 0ULL; p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = 0; p->numa_migrate_seq = 1;
p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_preferred_nid = -1; p->numa_preferred_nid = -1;
p->numa_work.next = &p->numa_work; p->numa_work.next = &p->numa_work;
......
...@@ -884,7 +884,7 @@ static unsigned int task_scan_max(struct task_struct *p) ...@@ -884,7 +884,7 @@ static unsigned int task_scan_max(struct task_struct *p)
* the preferred node but still allow the scheduler to move the task again if * the preferred node but still allow the scheduler to move the task again if
* the nodes CPUs are overloaded. * the nodes CPUs are overloaded.
*/ */
unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
static inline int task_faults_idx(int nid, int priv) static inline int task_faults_idx(int nid, int priv)
{ {
...@@ -980,7 +980,7 @@ static void task_numa_placement(struct task_struct *p) ...@@ -980,7 +980,7 @@ static void task_numa_placement(struct task_struct *p)
/* Update the preferred nid and migrate task if possible */ /* Update the preferred nid and migrate task if possible */
p->numa_preferred_nid = max_nid; p->numa_preferred_nid = max_nid;
p->numa_migrate_seq = 0; p->numa_migrate_seq = 1;
migrate_task_to(p, preferred_cpu); migrate_task_to(p, preferred_cpu);
} }
} }
...@@ -4121,6 +4121,20 @@ static void move_task(struct task_struct *p, struct lb_env *env) ...@@ -4121,6 +4121,20 @@ static void move_task(struct task_struct *p, struct lb_env *env)
set_task_cpu(p, env->dst_cpu); set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0); activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0);
#ifdef CONFIG_NUMA_BALANCING
if (p->numa_preferred_nid != -1) {
int src_nid = cpu_to_node(env->src_cpu);
int dst_nid = cpu_to_node(env->dst_cpu);
/*
* If the load balancer has moved the task then limit
* migrations from taking place in the short term in
* case this is a short-lived migration.
*/
if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
p->numa_migrate_seq = 0;
}
#endif
} }
/* /*
......
...@@ -2378,6 +2378,18 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long ...@@ -2378,6 +2378,18 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
last_nidpid = page_nidpid_xchg_last(page, this_nidpid); last_nidpid = page_nidpid_xchg_last(page, this_nidpid);
if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid) if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid)
goto out; goto out;
#ifdef CONFIG_NUMA_BALANCING
/*
* If the scheduler has just moved us away from our
* preferred node, do not bother migrating pages yet.
* This way a short and temporary process migration will
* not cause excessive memory migration.
*/
if (polnid != current->numa_preferred_nid &&
!current->numa_migrate_seq)
goto out;
#endif
} }
if (curnid != polnid) if (curnid != polnid)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment