Commit 5e9d834a authored by David Rientjes, committed by Linus Torvalds

oom: sacrifice child with highest badness score for parent

When a task is chosen for oom kill, the oom killer first attempts to
sacrifice a child not sharing its parent's memory instead.  Unfortunately,
this often kills in a seemingly random fashion based on the ordering of
the selected task's child list.  Additionally, it is not guaranteed at all
to free a large amount of memory that we need to prevent additional oom
killing in the very near future.

Instead, we now only attempt to sacrifice the worst child not sharing its
parent's memory, if one exists.  The worst child is indicated with the
highest badness() score.  This has two advantages: we kill a
memory-hogging task more often, and we allow the configurable
/proc/pid/oom_adj value to be considered as a factor in which child to
kill.

Reviewers may observe that the previous implementation would iterate
through the children and attempt to kill each until one was successful, and
then the parent if none were found, while the new code simply kills the
most memory-hogging task or the parent.  Note that the only time
oom_kill_task() fails, however, is when a child does not have an mm or has
a /proc/pid/oom_adj of OOM_DISABLE.  badness() returns 0 for both cases,
so the final oom_kill_task() will always succeed.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 6cf86ac6
...@@ -362,10 +362,10 @@ static void dump_tasks(const struct mem_cgroup *mem) ...@@ -362,10 +362,10 @@ static void dump_tasks(const struct mem_cgroup *mem)
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
struct mem_cgroup *mem) struct mem_cgroup *mem)
{ {
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_adj=%d\n", "oom_adj=%d\n",
current->comm, gfp_mask, order, current->signal->oom_adj); current->comm, gfp_mask, order, current->signal->oom_adj);
task_lock(current);
cpuset_print_task_mems_allowed(current); cpuset_print_task_mems_allowed(current);
task_unlock(current); task_unlock(current);
dump_stack(); dump_stack();
...@@ -436,8 +436,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, ...@@ -436,8 +436,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned long points, struct mem_cgroup *mem, unsigned long points, struct mem_cgroup *mem,
const char *message) const char *message)
{ {
struct task_struct *c; struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t = p; struct task_struct *t = p;
unsigned long victim_points = 0;
struct timespec uptime;
if (printk_ratelimit()) if (printk_ratelimit())
dump_header(p, gfp_mask, order, mem); dump_header(p, gfp_mask, order, mem);
...@@ -451,22 +454,37 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, ...@@ -451,22 +454,37 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
return 0; return 0;
} }
printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", task_lock(p);
message, task_pid_nr(p), p->comm, points); pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
task_unlock(p);
/* Try to kill a child first */ /*
* If any of p's children has a different mm and is eligible for kill,
* the one with the highest badness() score is sacrificed for its
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
do_posix_clock_monotonic_gettime(&uptime);
do { do {
list_for_each_entry(c, &t->children, sibling) { list_for_each_entry(child, &t->children, sibling) {
if (c->mm == p->mm) unsigned long child_points;
if (child->mm == p->mm)
continue; continue;
if (mem && !task_in_mem_cgroup(c, mem)) if (mem && !task_in_mem_cgroup(child, mem))
continue; continue;
if (!oom_kill_task(c))
return 0; /* badness() returns 0 if the thread is unkillable */
child_points = badness(child, uptime.tv_sec);
if (child_points > victim_points) {
victim = child;
victim_points = child_points;
}
} }
} while_each_thread(p, t); } while_each_thread(p, t);
return oom_kill_task(p); return oom_kill_task(victim);
} }
#ifdef CONFIG_CGROUP_MEM_RES_CTLR #ifdef CONFIG_CGROUP_MEM_RES_CTLR
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment