Commit 4b534334 authored by KAMEZAWA Hiroyuki, committed by Linus Torvalds

memcg: clean up try_charge main loop

mem_cgroup_try_charge() has a big loop in it and is hard to read.
Most of the routines in that loop are for the slow path.  This patch moves
that code out of the loop and makes it clear what is being done.

Summary:
 - factor out a function to detect whether a memcg is under account move.
 - factor out a function to wait for the end of task account moving.
 - factor out the main loop's slow path as a function and make it clear
   from its return code why we retry or quit.
 - add a fatal_signal_pending() check for bypassing the charge loop.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 65e0e811
...@@ -1047,6 +1047,49 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) ...@@ -1047,6 +1047,49 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness; return swappiness;
} }
/*
 * Test whether @mem is involved in the charge move described by the
 * global "mc" (mc.from / mc.to).
 *
 * Returns true when @mem is itself mc.from or mc.to, or, when
 * @mem->use_hierarchy is set, when css_is_ancestor() reports a hierarchy
 * relation between mc.from (or mc.to) and @mem.  Returns false otherwise,
 * including when mc.from or mc.to is unset.
 */
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
	/* Snapshot both ends of the move once; all checks use these copies. */
	struct mem_cgroup *src = mc.from;
	struct mem_cgroup *dst = mc.to;
	bool under_move = false;

	/* Direct hit: @mem is one end of the move. */
	if (mem == src || mem == dst)
		return true;

	/* The hierarchy walk needs both ends set and a hierarchical @mem. */
	if (!(src && dst && mem->use_hierarchy))
		return false;

	rcu_read_lock();
	/*
	 * css_tryget() may fail (presumably when the group is going away);
	 * in that case that end of the move is simply not examined.
	 */
	if (css_tryget(&src->css)) {
		under_move = css_is_ancestor(&src->css, &mem->css);
		css_put(&src->css);
	}
	/* Only look at the destination when the source did not match. */
	if (!under_move && css_tryget(&dst->css)) {
		under_move = css_is_ancestor(&dst->css, &mem->css);
		css_put(&dst->css);
	}
	rcu_read_unlock();

	return under_move;
}
/*
 * If another task (mc.moving_task) is moving charges and @mem is under
 * that move, sleep on mc.waitq in TASK_INTERRUPTIBLE state.
 *
 * Returns true when @mem was found to be under a move, whether or not
 * schedule() was actually called; the caller can then retry.  Returns
 * false when no move is in progress, when current is the mover itself,
 * or when @mem is not under the move.
 */
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	DEFINE_WAIT(wait);

	/* Nobody is moving charges, or we are the mover: nothing to wait for. */
	if (!mc.moving_task || current == mc.moving_task)
		return false;

	if (!mem_cgroup_under_move(mem))
		return false;

	prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
	/* The moving context might have finished already; recheck first. */
	if (mc.moving_task)
		schedule();
	finish_wait(&mc.waitq, &wait);

	return true;
}
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{ {
int *val = data; int *val = data;
...@@ -1575,16 +1618,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, ...@@ -1575,16 +1618,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
return NOTIFY_OK; return NOTIFY_OK;
} }
/*
 * Result codes returned by __mem_cgroup_do_charge().
 * See __mem_cgroup_try_charge() for details.
 */
enum {
CHARGE_OK, /* the charge succeeded */
CHARGE_RETRY, /* not charged yet; retrying is expected and not an error */
CHARGE_NOMEM, /* we can't do more; the caller returns -ENOMEM */
CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and there was not enough room */
CHARGE_OOM_DIE, /* the current task was killed because of OOM */
};
/*
 * Make one attempt to charge @csize to @mem, running reclaim and
 * (optionally) the OOM handler when the charge does not fit.
 *
 * @mem:       memory cgroup to charge.
 * @gfp_mask:  allocation flags; without __GFP_WAIT no reclaim is attempted.
 * @csize:     amount to charge; anything above PAGE_SIZE is treated as a
 *             batched request that the caller should shrink and retry.
 * @oom_check: when true, mem_cgroup_handle_oom() is invoked if reclaim
 *             did not help.
 *
 * Returns one of the CHARGE_* codes:
 *   CHARGE_OK         - @csize was charged to mem->res (and to mem->memsw
 *                       when do_swap_account is set).
 *   CHARGE_RETRY      - nothing was charged, but the caller should retry.
 *   CHARGE_WOULDBLOCK - the limit was hit and __GFP_WAIT is not set.
 *   CHARGE_NOMEM      - reclaim did not help and @oom_check is false.
 *   CHARGE_OOM_DIE    - mem_cgroup_handle_oom() returned false.
 *
 * On every return other than CHARGE_OK no charge is left behind on either
 * counter.
 */
static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
				int csize, bool oom_check)
{
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&mem->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		/*
		 * The mem+swap counter failed after the mem counter had
		 * already been charged.  Undo the mem charge, otherwise it
		 * leaks on every failed attempt.
		 */
		res_counter_uncharge(&mem->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else {
		/* The mem counter failed. */
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	}

	if (csize > PAGE_SIZE) /* change csize and retry */
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
					gfp_mask, flags);
	/*
	 * try_to_free_mem_cgroup_pages() might not give us a full
	 * picture of reclaim. Some pages are reclaimed and might be
	 * moved to swap cache or just unmapped from the cgroup.
	 * Check the limit again to see if the reclaim reduced the
	 * current usage of the cgroup before giving up.
	 */
	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
		return CHARGE_RETRY;

	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* If we don't need to call the oom-killer at all, return immediately. */
	if (!oom_check)
		return CHARGE_NOMEM;

	/* Check OOM; a false return means the current task is dying. */
	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}
/* /*
* Unlike exported interface, "oom" parameter is added. if oom==true, * Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked. * oom-killer can be invoked.
*/ */
static int __mem_cgroup_try_charge(struct mm_struct *mm, static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{ {
struct mem_cgroup *mem, *mem_over_limit; int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL;
struct res_counter *fail_res; int ret;
int csize = CHARGE_SIZE; int csize = CHARGE_SIZE;
/* /*
...@@ -1602,120 +1712,56 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ...@@ -1602,120 +1712,56 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not * thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage). * set, if so charge the init_mm (happens for pagecache usage).
*/ */
mem = *memcg; if (*memcg) {
if (likely(!mem)) { mem = *memcg;
css_get(&mem->css);
} else {
mem = try_get_mem_cgroup_from_mm(mm); mem = try_get_mem_cgroup_from_mm(mm);
if (unlikely(!mem))
return 0;
*memcg = mem; *memcg = mem;
} else {
css_get(&mem->css);
} }
if (unlikely(!mem))
return 0;
VM_BUG_ON(css_is_removed(&mem->css)); VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem)) if (mem_cgroup_is_root(mem))
goto done; goto done;
while (1) { do {
int ret = 0; bool oom_check;
unsigned long flags = 0;
if (consume_stock(mem)) if (consume_stock(mem))
goto done; goto done; /* don't need to fill stock */
/* If killed, bypass charge */
ret = res_counter_charge(&mem->res, csize, &fail_res); if (fatal_signal_pending(current))
if (likely(!ret)) { goto bypass;
if (!do_swap_account)
break;
ret = res_counter_charge(&mem->memsw, csize, &fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
res_counter_uncharge(&mem->res, csize);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
} else
/* mem counter fails */
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
res);
/* reduce request size and retry */ oom_check = false;
if (csize > PAGE_SIZE) { if (oom && !nr_oom_retries) {
csize = PAGE_SIZE; oom_check = true;
continue; nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
} }
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
gfp_mask, flags);
if (ret)
continue;
/* switch (ret) {
* try_to_free_mem_cgroup_pages() might not give us a full case CHARGE_OK:
* picture of reclaim. Some pages are reclaimed and might be break;
* moved to swap cache or just unmapped from the cgroup. case CHARGE_RETRY: /* not in OOM situation but retry */
* Check the limit again to see if the reclaim reduced the csize = PAGE_SIZE;
* current usage of the cgroup before giving up break;
* case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
*/ goto nomem;
if (mem_cgroup_check_under_limit(mem_over_limit)) case CHARGE_NOMEM: /* OOM routine works */
continue;
/* try to avoid oom while someone is moving charge */
if (mc.moving_task && current != mc.moving_task) {
struct mem_cgroup *from, *to;
bool do_continue = false;
/*
* There is a small race that "from" or "to" can be
* freed by rmdir, so we use css_tryget().
*/
from = mc.from;
to = mc.to;
if (from && css_tryget(&from->css)) {
if (mem_over_limit->use_hierarchy)
do_continue = css_is_ancestor(
&from->css,
&mem_over_limit->css);
else
do_continue = (from == mem_over_limit);
css_put(&from->css);
}
if (!do_continue && to && css_tryget(&to->css)) {
if (mem_over_limit->use_hierarchy)
do_continue = css_is_ancestor(
&to->css,
&mem_over_limit->css);
else
do_continue = (to == mem_over_limit);
css_put(&to->css);
}
if (do_continue) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait,
TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
continue;
}
}
if (!nr_retries--) {
if (!oom) if (!oom)
goto nomem; goto nomem;
if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { /* If oom, we never return -ENOMEM */
nr_retries = MEM_CGROUP_RECLAIM_RETRIES; nr_oom_retries--;
continue; break;
} case CHARGE_OOM_DIE: /* Killed by OOM Killer */
/* When we reach here, current task is dying .*/
css_put(&mem->css);
goto bypass; goto bypass;
} }
} } while (ret != CHARGE_OK);
if (csize > PAGE_SIZE) if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE); refill_stock(mem, csize - PAGE_SIZE);
done: done:
...@@ -1724,6 +1770,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ...@@ -1724,6 +1770,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
css_put(&mem->css); css_put(&mem->css);
return -ENOMEM; return -ENOMEM;
bypass: bypass:
if (mem)
css_put(&mem->css);
*memcg = NULL; *memcg = NULL;
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment