Commit 02491447 authored by Daisuke Nishimura, committed by Linus Torvalds

memcg: move charges of anonymous swap

This patch is another core part of this move-charge-at-task-migration
feature.  It enables moving charges of anonymous swaps.

To move the charge of swap, we need to exchange swap_cgroup's record.

In current implementation, swap_cgroup's record is protected by:

  - page lock: if the entry is on swap cache.
  - swap_lock: if the entry is not on swap cache.

This works well in usual swap-in/out activity.

But this behavior makes the feature of moving swap charge check many
conditions in order to exchange swap_cgroup's record safely.

So I changed the modification of swap_cgroup's record (swap_cgroup_record())
to use xchg, and defined a new function to cmpxchg swap_cgroup's record.

This patch also enables moving the charge of swap caches that are not
pte_present but not yet uncharged, which can exist on the swap-out path, by
getting the target pages via find_get_page() as do_mincore() does.

[kosaki.motohiro@jp.fujitsu.com: fix ia64 build]
[akpm@linux-foundation.org: fix typos]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 8033b97c
...@@ -420,6 +420,8 @@ NOTE2: It is recommended to set the soft limit always below the hard limit, ...@@ -420,6 +420,8 @@ NOTE2: It is recommended to set the soft limit always below the hard limit,
Users can move charges associated with a task along with task migration, that Users can move charges associated with a task along with task migration, that
is, uncharge task's pages from the old cgroup and charge them to the new cgroup. is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
This feature is not supported in !CONFIG_MMU environments because of lack of
page tables.
8.1 Interface 8.1 Interface
......
...@@ -118,6 +118,8 @@ static inline void __init page_cgroup_init_flatmem(void) ...@@ -118,6 +118,8 @@ static inline void __init page_cgroup_init_flatmem(void)
#include <linux/swap.h> #include <linux/swap.h>
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new);
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
extern unsigned short lookup_swap_cgroup(swp_entry_t ent); extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern int swap_cgroup_swapon(int type, unsigned long max_pages);
......
...@@ -355,6 +355,7 @@ static inline void disable_swap_token(void) ...@@ -355,6 +355,7 @@ static inline void disable_swap_token(void)
#ifdef CONFIG_CGROUP_MEM_RES_CTLR #ifdef CONFIG_CGROUP_MEM_RES_CTLR
extern void extern void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
#else #else
static inline void static inline void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
...@@ -485,6 +486,14 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) ...@@ -485,6 +486,14 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
{ {
} }
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
 * Stub variant of mem_cgroup_count_swap_user(): reports zero users for every
 * swap entry.  @pagep is never written here, so a caller that inspects
 * *pagep afterwards must have initialised it itself.
 * NOTE(review): this appears to sit in the !CONFIG_SWAP half of the header;
 * confirm against the surrounding #if structure.
 */
static inline int
mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
{
return 0;
}
#endif
#endif /* CONFIG_SWAP */ #endif /* CONFIG_SWAP */
#endif /* __KERNEL__*/ #endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */ #endif /* _LINUX_SWAP_H */
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
...@@ -2270,6 +2271,54 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) ...@@ -2270,6 +2271,54 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
} }
rcu_read_unlock(); rcu_read_unlock();
} }
/**
 * mem_cgroup_move_swap_account - transfer the charge of one swap entry.
 * @entry: swap entry whose charge is moved
 * @from: mem_cgroup currently recorded as the owner of @entry
 * @to: mem_cgroup that becomes the new owner
 *
 * The swap_cgroup record of @entry is switched from @from's css id to @to's
 * css id with swap_cgroup_cmpxchg(), so the move only happens if the record
 * still holds @from's id at that moment.
 *
 * Precondition: the caller has already charged @to, i.e. called
 * res_counter_charge() on both res and memsw, and has called css_get().
 *
 * Returns 0 when the record was moved, -EINVAL when it did not match @from.
 * On failure no counter, statistic or reference is touched.
 */
static int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	const unsigned short from_id = css_id(&from->css);
	const unsigned short to_id = css_id(&to->css);

	/* The record no longer names @from: there is nothing to move. */
	if (swap_cgroup_cmpxchg(entry, from_id, to_id) != from_id)
		return -EINVAL;

	/* Release what @from was holding for this swap entry. */
	if (!mem_cgroup_is_root(from))
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	mem_cgroup_swap_statistics(from, false);
	mem_cgroup_put(from);

	/*
	 * The precharge hit both to->res and to->memsw, but only the memsw
	 * part is kept for a swap entry, so give the res part back.
	 */
	if (!mem_cgroup_is_root(to))
		res_counter_uncharge(&to->res, PAGE_SIZE);
	mem_cgroup_swap_statistics(to, true);
	mem_cgroup_get(to);
	css_put(&to->css);

	return 0;
}
#else
/*
 * Stub used on the #else side of the conditional that guards the real
 * implementation: no swap record is moved and the call always fails with
 * -EINVAL, the same value the real function uses to report failure.
 */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to)
{
return -EINVAL;
}
#endif #endif
/* /*
...@@ -2949,6 +2998,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, ...@@ -2949,6 +2998,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
} }
#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup *cgrp, static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
struct cftype *cft, u64 val) struct cftype *cft, u64 val)
{ {
...@@ -2967,6 +3017,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, ...@@ -2967,6 +3017,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
return 0; return 0;
} }
#else
/*
 * !CONFIG_MMU variant of the move-charge write handler: every write is
 * rejected with -ENOSYS and @cgrp, @cft and @val are ignored.
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
return -ENOSYS;
}
#endif
/* For read statistics */ /* For read statistics */
...@@ -3489,6 +3546,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, ...@@ -3489,6 +3546,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
return ret; return ret;
} }
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */ /* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256 #define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count) static int mem_cgroup_do_precharge(unsigned long count)
...@@ -3544,77 +3602,124 @@ static int mem_cgroup_do_precharge(unsigned long count) ...@@ -3544,77 +3602,124 @@ static int mem_cgroup_do_precharge(unsigned long count)
} }
return ret; return ret;
} }
#else /* !CONFIG_MMU */
/*
 * !CONFIG_MMU stub: performs no precharge work and returns 0
 * unconditionally (presumably "attach permitted" to the cgroup core --
 * confirm against the cgroup_subsys contract).
 */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
struct task_struct *p,
bool threadgroup)
{
return 0;
}
/*
 * !CONFIG_MMU stub: intentionally empty, all arguments are ignored.
 */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
struct task_struct *p,
bool threadgroup)
{
}
/*
 * !CONFIG_MMU stub: intentionally empty, so no charge is moved when a task
 * migrates between cgroups; all arguments are ignored.
 */
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
struct task_struct *p,
bool threadgroup)
{
}
#endif
/** /**
* is_target_pte_for_mc - check a pte whether it is valid for move charge * is_target_pte_for_mc - check a pte whether it is valid for move charge
* @vma: the vma the pte to be checked belongs * @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked * @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked * @ptent: the pte to be checked
* @target: the pointer the target page will be stored(can be NULL) * @target: the pointer the target page or swap ent will be stored(can be NULL)
* *
* Returns * Returns
* 0(MC_TARGET_NONE): if the pte is not a target for move charge. * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
* move charge. if @target is not NULL, the page is stored in target->page * move charge. if @target is not NULL, the page is stored in target->page
* with extra refcnt got(Callers should handle it). * with extra refcnt got(Callers should handle it).
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
* *
* Called with pte lock held. * Called with pte lock held.
*/ */
/* We add a new member later. */
union mc_target { union mc_target {
struct page *page; struct page *page;
swp_entry_t ent;
}; };
/* We add a new type later. */
enum mc_target_type { enum mc_target_type {
MC_TARGET_NONE, /* not used */ MC_TARGET_NONE, /* not used */
MC_TARGET_PAGE, MC_TARGET_PAGE,
MC_TARGET_SWAP,
}; };
static int is_target_pte_for_mc(struct vm_area_struct *vma, static int is_target_pte_for_mc(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target) unsigned long addr, pte_t ptent, union mc_target *target)
{ {
struct page *page; struct page *page = NULL;
struct page_cgroup *pc; struct page_cgroup *pc;
int ret = 0; int ret = 0;
swp_entry_t ent = { .val = 0 };
int usage_count = 0;
bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
&mc.to->move_charge_at_immigrate); &mc.to->move_charge_at_immigrate);
if (!pte_present(ptent)) if (!pte_present(ptent)) {
return 0; /* TODO: handle swap of shmes/tmpfs */
if (pte_none(ptent) || pte_file(ptent))
page = vm_normal_page(vma, addr, ptent); return 0;
if (!page || !page_mapped(page)) else if (is_swap_pte(ptent)) {
return 0; ent = pte_to_swp_entry(ptent);
/* if (!move_anon || non_swap_entry(ent))
* TODO: We don't move charges of file(including shmem/tmpfs) pages for return 0;
* now. usage_count = mem_cgroup_count_swap_user(ent, &page);
*/ }
if (!move_anon || !PageAnon(page)) } else {
return 0; page = vm_normal_page(vma, addr, ptent);
/* if (!page || !page_mapped(page))
* TODO: We don't move charges of shared(used by multiple processes) return 0;
* pages for now. /*
*/ * TODO: We don't move charges of file(including shmem/tmpfs)
if (page_mapcount(page) > 1) * pages for now.
return 0; */
if (!get_page_unless_zero(page)) if (!move_anon || !PageAnon(page))
return 0;
if (!get_page_unless_zero(page))
return 0;
usage_count = page_mapcount(page);
}
if (usage_count > 1) {
/*
* TODO: We don't move charges of shared(used by multiple
* processes) pages for now.
*/
if (page)
put_page(page);
return 0; return 0;
}
pc = lookup_page_cgroup(page); if (page) {
/* pc = lookup_page_cgroup(page);
* Do only loose check w/o page_cgroup lock. mem_cgroup_move_account() /*
* checks the pc is valid or not under the lock. * Do only loose check w/o page_cgroup lock.
*/ * mem_cgroup_move_account() checks the pc is valid or not under
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { * the lock.
ret = MC_TARGET_PAGE; */
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
}
if (!ret || !target)
put_page(page);
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && do_swap_account && !ret &&
css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
ret = MC_TARGET_SWAP;
if (target) if (target)
target->page = page; target->ent = ent;
} }
if (!ret || !target)
put_page(page);
return ret; return ret;
} }
...@@ -3754,6 +3859,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, ...@@ -3754,6 +3859,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
int type; int type;
struct page *page; struct page *page;
struct page_cgroup *pc; struct page_cgroup *pc;
swp_entry_t ent;
if (!mc.precharge) if (!mc.precharge)
break; break;
...@@ -3775,6 +3881,11 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, ...@@ -3775,6 +3881,11 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
put: /* is_target_pte_for_mc() gets the page */ put: /* is_target_pte_for_mc() gets the page */
put_page(page); put_page(page);
break; break;
case MC_TARGET_SWAP:
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to))
mc.precharge--;
break;
default: default:
break; break;
} }
......
...@@ -334,6 +334,37 @@ static int swap_cgroup_prepare(int type) ...@@ -334,6 +334,37 @@ static int swap_cgroup_prepare(int type)
return -ENOMEM; return -ENOMEM;
} }
/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry whose record is to be cmpxchged
 * @old: id the record is expected to hold
 * @new: id to store if the record holds @old
 *
 * Returns @old on success, 0 on failure.
 * (There is no mem_cgroup using 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	const int type = swp_type(ent);
	const unsigned long offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[type];
	struct swap_cgroup *sc;

	/* Pick the map page holding this offset, then the slot inside it. */
	sc = page_address(ctrl->map[offset / SC_PER_PAGE]);
	sc += offset & SC_POS_MASK;

	return cmpxchg(&sc->id, old, new) == old ? old : 0;
}
/** /**
* swap_cgroup_record - record mem_cgroup for this swp_entry. * swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into * @ent: swap entry to be recorded into
...@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) ...@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
mappage = ctrl->map[idx]; mappage = ctrl->map[idx];
sc = page_address(mappage); sc = page_address(mappage);
sc += pos; sc += pos;
old = sc->id; old = xchg(&sc->id, id);
sc->id = id;
return old; return old;
} }
......
...@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) ...@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
return p != NULL; return p != NULL;
} }
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/**
 * mem_cgroup_count_swap_user - count the users of a swap entry
 * @ent: the swap entry to be checked
 * @pagep: where the swap cache page of the entry (or NULL) is stored
 *
 * Returns the number of users of the swap entry: the mapcount of its swap
 * cache page, if one is found, plus the count taken from the swap map.  The
 * number is valid only for swaps of anonymous pages.
 *
 * If the entry is found on swap cache, the page is stored to @pagep with its
 * refcount incremented; otherwise NULL is stored.
 */
int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
{
	struct swap_info_struct *si;
	struct page *cache_page = find_get_page(&swapper_space, ent.val);
	int users = cache_page ? page_mapcount(cache_page) : 0;

	si = swap_info_get(ent);
	if (si) {
		users += swap_count(si->swap_map[swp_offset(ent)]);
		/*
		 * swap_lock is dropped here, so swap_info_get() presumably
		 * returns with it held on success.
		 */
		spin_unlock(&swap_lock);
	}

	*pagep = cache_page;
	return users;
}
#endif
#ifdef CONFIG_HIBERNATION #ifdef CONFIG_HIBERNATION
/* /*
* Find the swap type that corresponds to given device (if any). * Find the swap type that corresponds to given device (if any).
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment