Commit 52ebea74, authored by Tejun Heo, committed by Jens Axboe

writeback: make backing_dev_info host cgroup-specific bdi_writebacks

For the planned cgroup writeback support, on each bdi
(backing_dev_info), each memcg will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

On the default hierarchy, blkcg implicitly enables memcg.  This allows
using memcg's page ownership for attributing writeback IOs, and every
memcg - blkcg combination can be served by its own wb by assigning a
dedicated wb to each memcg.  This means that there may be multiple
wb's of a bdi mapped to the same blkcg.  As congested state is per
blkcg - bdi combination, those wb's should share the same congested
state.  This is achieved by tracking congested state via
bdi_writeback_congested structs which are keyed by blkcg.

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are created on-demand or
looked up while dirtying an inode according to the memcg of the page
being dirtied or current task.  Each cgwb is indexed on bdi->cgwb_tree
by its memcg id.  Once an inode is associated with its wb, it can be
retrieved using inode_to_wb().

Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all
pages will keep being associated with bdi->wb.

v3: inode_attach_wb() in account_page_dirtied() moved inside
    mapping_cap_account_dirty() block where it's known to be !NULL.
    Also, an unnecessary NULL check before kfree() removed.  Both
    detected by the kbuild bot.

v2: Updated so that wb association is per inode and wb is per memcg
    rather than blkcg.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: kbuild test robot <fengguang.wu@intel.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent 89e9b9e0
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/genhd.h> #include <linux/genhd.h>
#include <linux/delay.h> #include <linux/delay.h>
...@@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) ...@@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
} }
spin_unlock_irq(&blkcg->lock); spin_unlock_irq(&blkcg->lock);
wb_blkcg_offline(blkcg);
} }
static void blkcg_css_free(struct cgroup_subsys_state *css) static void blkcg_css_free(struct cgroup_subsys_state *css)
...@@ -827,7 +830,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) ...@@ -827,7 +830,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&blkcg->lock); spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list); INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
return &blkcg->css; return &blkcg->css;
} }
......
...@@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) ...@@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/ */
void inode_wb_list_del(struct inode *inode) void inode_wb_list_del(struct inode *inode)
{ {
struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = inode_to_wb(inode);
spin_lock(&bdi->wb.list_lock); spin_lock(&wb->list_lock);
list_del_init(&inode->i_wb_list); list_del_init(&inode->i_wb_list);
spin_unlock(&bdi->wb.list_lock); spin_unlock(&wb->list_lock);
} }
/* /*
...@@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) ...@@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) { if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY; const int was_dirty = inode->i_state & I_DIRTY;
inode_attach_wb(inode, NULL);
if (flags & I_DIRTY_INODE) if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME; inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags; inode->i_state |= flags;
......
...@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); ...@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
void __destroy_inode(struct inode *inode) void __destroy_inode(struct inode *inode)
{ {
BUG_ON(inode_has_buffers(inode)); BUG_ON(inode_has_buffers(inode));
inode_detach_wb(inode);
security_inode_free(inode); security_inode_free(inode);
fsnotify_inode_delete(inode); fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx); locks_free_lock_context(inode->i_flctx);
......
...@@ -2,8 +2,11 @@ ...@@ -2,8 +2,11 @@
#define __LINUX_BACKING_DEV_DEFS_H #define __LINUX_BACKING_DEV_DEFS_H
#include <linux/list.h> #include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/percpu_counter.h> #include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h> #include <linux/flex_proportions.h>
#include <linux/timer.h> #include <linux/timer.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
...@@ -37,10 +40,43 @@ enum wb_stat_item { ...@@ -37,10 +40,43 @@ enum wb_stat_item {
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
/*
* For cgroup writeback, multiple wb's may map to the same blkcg. Those
* wb's can operate mostly independently but should share the congested
* state. To facilitate such sharing, the congested state is tracked using
* the following struct which is created on demand, indexed by blkcg ID on
* its bdi, and refcounted.
*/
struct bdi_writeback_congested { struct bdi_writeback_congested {
unsigned long state; /* WB_[a]sync_congested flags */ unsigned long state; /* WB_[a]sync_congested flags */
#ifdef CONFIG_CGROUP_WRITEBACK
struct backing_dev_info *bdi; /* the associated bdi */
atomic_t refcnt; /* nr of attached wb's and blkg */
int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congested_tree */
#endif
}; };
/*
* Each wb (bdi_writeback) can perform writeback operations, is measured
* and throttled, independently. Without cgroup writeback, each bdi
* (backing_dev_info) is served by its embedded bdi->wb.
*
* On the default hierarchy, blkcg implicitly enables memcg. This allows
* using memcg's page ownership for attributing writeback IOs, and every
* memcg - blkcg combination can be served by its own wb by assigning a
* dedicated wb to each memcg, which enables isolation across different
* cgroups and propagation of IO back pressure down from the IO layer up to
* the tasks which are generating the dirty pages to be written back.
*
* A cgroup wb is indexed on its bdi by the ID of the associated memcg,
* refcounted with the number of inodes attached to it, and pins the memcg
* and the corresponding blkcg. As the corresponding blkcg for a memcg may
* change as blkcg is disabled and enabled higher up in the hierarchy, a wb
* is tested for blkcg after lookup and removed from index on mismatch so
* that a new wb for the combination can be created.
*/
struct bdi_writeback { struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */ struct backing_dev_info *bdi; /* our parent bdi */
...@@ -78,6 +114,19 @@ struct bdi_writeback { ...@@ -78,6 +114,19 @@ struct bdi_writeback {
spinlock_t work_lock; /* protects work_list & dwork scheduling */ spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list; struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */ struct delayed_work dwork; /* work item used for writeback */
#ifdef CONFIG_CGROUP_WRITEBACK
struct percpu_ref refcnt; /* used only for !root wb's */
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
union {
struct work_struct release_work;
struct rcu_head rcu;
};
#endif
}; };
struct backing_dev_info { struct backing_dev_info {
...@@ -92,9 +141,13 @@ struct backing_dev_info { ...@@ -92,9 +141,13 @@ struct backing_dev_info {
unsigned int min_ratio; unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac; unsigned int max_ratio, max_prop_frac;
struct bdi_writeback wb; /* default writeback info for this bdi */ struct bdi_writeback wb; /* the root writeback info for this bdi */
struct bdi_writeback_congested wb_congested; struct bdi_writeback_congested wb_congested; /* its congested state */
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
atomic_t usage_cnt; /* counts both cgwbs and cgwb_congested's */
#endif
struct device *dev; struct device *dev;
struct timer_list laptop_mode_wb_timer; struct timer_list laptop_mode_wb_timer;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h> #include <linux/backing-dev-defs.h>
int __must_check bdi_init(struct backing_dev_info *bdi); int __must_check bdi_init(struct backing_dev_info *bdi);
...@@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word) ...@@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word)
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
void wb_congested_put(struct bdi_writeback_congested *congested);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp);
void __inode_attach_wb(struct inode *inode, struct page *page);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct blkcg *blkcg);
/** /**
* inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
* @inode: inode of interest * @inode: inode of interest
...@@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode) ...@@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
} }
/**
 * wb_tryget - attempt to take a reference on a wb
 * @wb: bdi_writeback to get
 *
 * The wb embedded in its bdi (@wb->bdi->wb) is not refcounted, so getting
 * it always succeeds.  For any other wb the result of percpu_ref_tryget()
 * on its refcnt is returned.
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
	if (wb == &wb->bdi->wb)
		return true;

	return percpu_ref_tryget(&wb->refcnt);
}
/**
 * wb_get - take a reference on a wb
 * @wb: bdi_writeback to get
 *
 * No-op for the wb embedded in its bdi; otherwise bumps @wb->refcnt.
 */
static inline void wb_get(struct bdi_writeback *wb)
{
	if (wb == &wb->bdi->wb)
		return;

	percpu_ref_get(&wb->refcnt);
}
/**
 * wb_put - drop a reference on a wb
 * @wb: bdi_writeback to put
 *
 * No-op for the wb embedded in its bdi; otherwise drops one reference
 * from @wb->refcnt.
 */
static inline void wb_put(struct bdi_writeback *wb)
{
	if (wb == &wb->bdi->wb)
		return;

	percpu_ref_put(&wb->refcnt);
}
/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Look up the wb of @bdi which matches both the memcg and the blkcg of
 * %current.  Must be called under rcu_read_lock(), which protects the
 * returned wb.  No reference is taken.  Returns NULL if there is no
 * matching wb.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
	struct cgroup_subsys_state *cur_memcg = task_css(current, memory_cgrp_id);
	struct bdi_writeback *found;

	/* a memcg css without a parent is served by the bdi's embedded wb */
	if (!cur_memcg->parent)
		return &bdi->wb;

	found = radix_tree_lookup(&bdi->cgwb_tree, cur_memcg->id);

	/*
	 * %current's blkcg equals the effective blkcg of its memcg, so
	 * comparing against task_css() is enough and the relatively
	 * expensive cgroup_get_e_css() can be avoided.
	 */
	if (unlikely(!found ||
		     found->blkcg_css != task_css(current, blkio_cgrp_id)))
		return NULL;

	return found;
}
/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  As this sits on a
 * relatively hot path, the common case is served by an RCU-protected
 * wb_find_current() lookup followed by wb_tryget(); only when that yields
 * nothing is %current's memcg css pinned and wb_get_create() invoked.
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
	struct cgroup_subsys_state *pinned_memcg;
	struct bdi_writeback *wb;

	/* fast path: existing wb which can still be referenced */
	rcu_read_lock();
	wb = wb_find_current(bdi);
	if (wb && unlikely(!wb_tryget(wb)))
		wb = NULL;
	rcu_read_unlock();

	if (likely(wb))
		return wb;

	/* slow path: wb_get_create() needs a pinned memcg css */
	pinned_memcg = task_get_css(current, memory_cgrp_id);
	wb = wb_get_create(bdi, pinned_memcg, gfp);
	css_put(pinned_memcg);

	return wb;
}
/**
 * inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @page: page being dirtied (may be NULL)
 *
 * Does nothing if @inode already has a wb.  Otherwise associates it with
 * the wb matching the memcg of @page or, if @page is NULL, of %current.
 * May be called w/ or w/o @inode->i_lock.
 */
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
	if (inode->i_wb)
		return;

	__inode_attach_wb(inode, page);
}
/**
 * inode_detach_wb - disassociate an inode from its wb
 * @inode: inode of interest
 *
 * @inode is being freed.  If it has a wb attached, drop the reference
 * held on that wb and clear @inode->i_wb.
 */
static inline void inode_detach_wb(struct inode *inode)
{
	struct bdi_writeback *attached = inode->i_wb;

	if (!attached)
		return;

	wb_put(attached);
	inode->i_wb = NULL;
}
/**
* inode_to_wb - determine the wb of an inode
* @inode: inode of interest
*
* Returns the wb @inode is currently associated with, i.e. the current
* value of @inode->i_wb.  No reference is taken.  The value is NULL while
* no wb is attached (inode_attach_wb() treats NULL as "not attached" and
* inode_detach_wb() resets it to NULL).
*/
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
return inode->i_wb;
}
#else /* CONFIG_CGROUP_WRITEBACK */ #else /* CONFIG_CGROUP_WRITEBACK */
static inline bool inode_cgwb_enabled(struct inode *inode) static inline bool inode_cgwb_enabled(struct inode *inode)
...@@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode) ...@@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
return false; return false;
} }
/*
* Stubs used when CONFIG_CGROUP_WRITEBACK is disabled.  In this
* configuration every lookup resolves to the wb embedded in the bdi
* (bdi->wb) and all refcounting helpers are no-ops.
*/

/* always hands back the congested state the embedded wb points to */
static inline struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
{
return bdi->wb.congested;
}
/* nothing to release: the stub above does not take a reference */
static inline void wb_congested_put(struct bdi_writeback_congested *congested)
{
}
/* no refcount to take, so trying to get a wb always succeeds */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
return true;
}
/* no-op: no refcount is maintained in this configuration */
static inline void wb_get(struct bdi_writeback *wb)
{
}
/* no-op: no refcount is maintained in this configuration */
static inline void wb_put(struct bdi_writeback *wb)
{
}
/* %current is always served by the bdi's embedded wb */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
return &bdi->wb;
}
/* nothing is ever created; @gfp is unused */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
return &bdi->wb;
}
/* no-op: nothing is recorded per inode in this configuration */
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
}
/* no-op counterpart of inode_attach_wb() */
static inline void inode_detach_wb(struct inode *inode)
{
}
/* the wb of an inode is the embedded wb of the inode's bdi */
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
return &inode_to_bdi(inode)->wb;
}
/* no-op: there are no cgroup wb's to kill when a memcg goes offline */
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}
/* no-op: there are no cgroup wb's to kill when a blkcg goes offline */
static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
}
#endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CONFIG_CGROUP_WRITEBACK */
#endif /* _LINUX_BACKING_DEV_H */ #endif /* _LINUX_BACKING_DEV_H */
...@@ -53,6 +53,10 @@ struct blkcg { ...@@ -53,6 +53,10 @@ struct blkcg {
/* TODO: per-policy storage in blkcg */ /* TODO: per-policy storage in blkcg */
unsigned int cfq_weight; /* belongs to cfq */ unsigned int cfq_weight; /* belongs to cfq */
unsigned int cfq_leaf_weight; unsigned int cfq_leaf_weight;
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
}; };
struct blkg_stat { struct blkg_stat {
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <uapi/linux/fs.h> #include <uapi/linux/fs.h>
struct backing_dev_info; struct backing_dev_info;
struct bdi_writeback;
struct export_operations; struct export_operations;
struct hd_geometry; struct hd_geometry;
struct iovec; struct iovec;
...@@ -635,6 +636,9 @@ struct inode { ...@@ -635,6 +636,9 @@ struct inode {
struct hlist_node i_hash; struct hlist_node i_hash;
struct list_head i_wb_list; /* backing dev IO list */ struct list_head i_wb_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */
#endif
struct list_head i_lru; /* inode LRU list */ struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list; struct list_head i_sb_list;
union { union {
......
...@@ -388,6 +388,10 @@ enum { ...@@ -388,6 +388,10 @@ enum {
OVER_LIMIT, OVER_LIMIT,
}; };
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
#endif
struct sock; struct sock;
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
void sock_update_memcg(struct sock *sk); void sock_update_memcg(struct sock *sk);
......
...@@ -368,6 +368,401 @@ static void wb_exit(struct bdi_writeback *wb) ...@@ -368,6 +368,401 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions); fprop_local_destroy_percpu(&wb->completions);
} }
#ifdef CONFIG_CGROUP_WRITEBACK
#include <linux/memcontrol.h>
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
* protected. cgwb_release_wait is used to wait for the completion of cgwb
* releases from bdi destruction path.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
* @bdi: associated bdi
* @blkcg_id: ID of the associated blkcg
* @gfp: allocation mask
*
* Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
* The returned wb_congested has its reference count incremented, except
* for @blkcg_id 1, for which @bdi's embedded wb_congested is returned
* without touching any refcount. Returns NULL if allocating a new
* wb_congested fails.
*
* The allocation is done with cgwb_lock dropped, after which the lookup is
* retried; if another caller inserted a matching entry in the meantime,
* the preallocated one is freed and the existing one is returned.
*/
struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
{
struct bdi_writeback_congested *new_congested = NULL, *congested;
struct rb_node **node, *parent;
unsigned long flags;
/* ID 1 is what cgwb_bdi_init() assigns to the embedded wb_congested */
if (blkcg_id == 1)
return &bdi->wb_congested;
retry:
spin_lock_irqsave(&cgwb_lock, flags);
/* walk the rbtree keyed by blkcg_id, remembering the insertion point */
node = &bdi->cgwb_congested_tree.rb_node;
parent = NULL;
while (*node != NULL) {
parent = *node;
congested = container_of(parent, struct bdi_writeback_congested,
rb_node);
if (congested->blkcg_id < blkcg_id)
node = &parent->rb_left;
else if (congested->blkcg_id > blkcg_id)
node = &parent->rb_right;
else
goto found;
}
if (new_congested) {
/* !found and storage for new one already allocated, insert */
congested = new_congested;
new_congested = NULL;
rb_link_node(&congested->rb_node, parent, node);
rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
/* dropped again in wb_congested_put() when the entry is destroyed */
atomic_inc(&bdi->usage_cnt);
goto found;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
/* allocate storage for new one and retry */
new_congested = kzalloc(sizeof(*new_congested), gfp);
if (!new_congested)
return NULL;
/* starts at zero; the caller's reference is taken at the found label */
atomic_set(&new_congested->refcnt, 0);
new_congested->bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
found:
atomic_inc(&congested->refcnt);
spin_unlock_irqrestore(&cgwb_lock, flags);
/* NULL unless the preallocated entry lost the race and went unused */
kfree(new_congested);
return congested;
}
/**
* wb_congested_put - put a wb_congested
* @congested: wb_congested to put
*
* Put @congested and destroy it if the refcnt reaches zero.
*/
void wb_congested_put(struct bdi_writeback_congested *congested)
{
struct backing_dev_info *bdi = congested->bdi;
unsigned long flags;
if (congested->blkcg_id == 1)
return;
local_irq_save(flags);
if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
local_irq_restore(flags);
return;
}
rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
spin_unlock_irqrestore(&cgwb_lock, flags);
kfree(congested);
if (atomic_dec_and_test(&bdi->usage_cnt))
wake_up_all(&cgwb_release_wait);
}
/*
* Work function which tears down a cgroup wb. It is queued by
* cgwb_release() once the wb's percpu refcnt has been released.
*
* Shuts the wb down, drops the memcg/blkcg css references and the
* wb_congested reference acquired in cgwb_create(), frees the wb after an
* RCU grace period and finally drops the bdi usage count taken when the wb
* was inserted, waking up anyone waiting on cgwb_release_wait when it hits
* zero.
*/
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
/* cache the bdi, @wb is handed to kfree_rcu() before the bdi is used */
struct backing_dev_info *bdi = wb->bdi;
wb_shutdown(wb);
css_put(wb->memcg_css);
css_put(wb->blkcg_css);
wb_congested_put(wb->congested);
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
/* bdi->cgwb_tree lookups are RCU protected, so defer the actual free */
kfree_rcu(wb, rcu);
if (atomic_dec_and_test(&bdi->usage_cnt))
wake_up_all(&cgwb_release_wait);
}
/*
* Release callback of wb->refcnt (registered via percpu_ref_init() in
* cgwb_create()). The actual teardown is handed off to
* cgwb_release_workfn() through the wb's release_work item.
* NOTE(review): presumably deferred because the teardown may sleep while
* this callback may not - confirm against the percpu_ref documentation.
*/
static void cgwb_release(struct percpu_ref *refcnt)
{
struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
refcnt);
schedule_work(&wb->release_work);
}
/*
* Unlink @wb from everything it is indexed on and kill its refcnt.
* Must be called with cgwb_lock held.
*
* @wb is removed from its bdi's cgwb_tree (keyed by the memcg css ID) and
* from the memcg and blkcg cgwb lists, after which percpu_ref_kill() is
* called on its refcnt. Warns if @wb was not found in the radix tree.
*/
static void cgwb_kill(struct bdi_writeback *wb)
{
lockdep_assert_held(&cgwb_lock);
WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
list_del(&wb->memcg_node);
list_del(&wb->blkcg_node);
percpu_ref_kill(&wb->refcnt);
}
/*
 * cgwb_create - create the wb for @memcg_css on @bdi and index it
 * @bdi: target bdi
 * @memcg_css: css of the memcg the wb is for (caller holds a reference)
 * @gfp: allocation mask
 *
 * Re-checks bdi->cgwb_tree under cgwb_lock first.  A wb found there whose
 * blkcg no longer matches the effective blkcg of @memcg_css is killed; a
 * matching one means there is nothing to do.  Otherwise a new wb is
 * allocated, initialized and, provided @bdi is still registered and both
 * cgroups are still online, inserted into bdi->cgwb_tree and linked on the
 * memcg and blkcg cgwb lists.
 *
 * No reference on the wb is returned; the caller is expected to look it
 * up again.
 *
 * Returns 0 if a matching wb exists on return (including the case where
 * another instance of this function won the insertion race), -ENOMEM on
 * allocation failure, -ENODEV if @bdi is unregistered or one of the
 * cgroups is offline, or the error reported by wb_init(),
 * percpu_ref_init() or radix_tree_insert().
 */
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct blkcg *blkcg;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	/* takes a reference on blkcg_css which is dropped at out_put */
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
	blkcg = css_to_blkcg(blkcg_css);
	memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
	blkcg_cgwb_list = &blkcg->cgwb_list;

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		/* don't return directly, the blkcg_css reference must be put */
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
	if (!wb->congested) {
		/*
		 * @ret is 0 here from percpu_ref_init(); report the failure
		 * so that the caller doesn't mistake it for success and retry
		 * the lookup forever.
		 */
		ret = -ENOMEM;
		goto err_ref_exit;
	}

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online.  Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			/* the indexed wb pins the bdi usage count and both css's */
			atomic_inc(&bdi->usage_cnt);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		/* losing the insertion race isn't an error, just drop ours */
		if (ret == -EEXIST)
			ret = 0;
		goto err_put_congested;
	}
	goto out_put;

err_put_congested:
	wb_congested_put(wb->congested);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}
/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 * create one.  The returned wb has its refcount incremented.  Returns
 * NULL if no wb could be obtained because creation failed.
 *
 * css_get() is used on @memcg_css along the way, so its refcnt must
 * already be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough; try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, the memcg association is guaranteed to
 * be at least as specific as the blkcg one (equal or descendant to the
 * associated blkcg) and thus identifies both associations.
 *
 * The blkcg associated with a memcg may change as blkcg is enabled and
 * disabled closer to the root of the hierarchy.  Each wb therefore
 * records both its memcg and its blkcg, and the blkcg is verified on
 * every lookup.  On mismatch the existing wb is discarded and a new one
 * is created.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_sleep_if(gfp & __GFP_WAIT);

	/* a memcg css without a parent is served by the embedded wb */
	if (!memcg_css->parent)
		return &bdi->wb;

	for (;;) {
		rcu_read_lock();
		wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
		if (wb) {
			struct cgroup_subsys_state *e_blkcg;

			/* see whether the blkcg association has changed */
			e_blkcg = cgroup_get_e_css(memcg_css->cgroup,
						   &blkio_cgrp_subsys);
			if (unlikely(wb->blkcg_css != e_blkcg ||
				     !wb_tryget(wb)))
				wb = NULL;
			css_put(e_blkcg);
		}
		rcu_read_unlock();

		if (wb)
			return wb;

		/* nothing usable, create one and look up again */
		if (cgwb_create(bdi, memcg_css, gfp))
			return NULL;
	}
}
/*
 * __inode_attach_wb - slow path of inode_attach_wb()
 * @inode: inode to associate with a wb
 * @page: page being dirtied, or NULL to use %current's memcg
 *
 * If cgroup writeback is enabled on @inode, get or create the wb for the
 * memcg of @page (or of %current when @page is NULL) using GFP_ATOMIC.
 * When cgroup writeback is disabled or no wb could be obtained, the bdi's
 * embedded wb is used instead.  The wb is then installed in @inode->i_wb
 * unless another caller got there first.
 */
void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (!page) {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		} else {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		}
	}

	/* fall back to the bdi's embedded wb */
	if (!wb)
		wb = &bdi->wb;

	/*
	 * Several instances of this function may race to update the same
	 * inode.  cmpxchg() picks the winner; a loser drops the reference
	 * it obtained above.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb) != NULL))
		wb_put(wb);
}
/*
* Set up the cgroup writeback state of a freshly initialized @bdi.
*
* The embedded wb is bound to the root memcg and root blkcg css's, and the
* embedded wb_congested gets blkcg_id 1, the ID which
* wb_congested_get_create() and wb_congested_put() treat specially. The
* cgwb radix tree and the congested rbtree start out empty.
*/
static void cgwb_bdi_init(struct backing_dev_info *bdi)
{
bdi->wb.memcg_css = mem_cgroup_root_css;
bdi->wb.blkcg_css = blkcg_root_css;
bdi->wb_congested.blkcg_id = 1;
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
/* base count, dropped by cgwb_bdi_destroy() before it waits for zero */
atomic_set(&bdi->usage_cnt, 1);
}
/*
* Kill every cgroup wb still indexed on @bdi and wait until all cgwb's and
* cgwb_congested's created on it have been released.
*
* Warns if the embedded wb is still marked WB_registered when called.
*/
static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
/* cgwb_kill() requires cgwb_lock and removes the wb from cgwb_tree */
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
spin_unlock_irq(&cgwb_lock);
/*
* All cgwb's and their congested states must be shutdown and
* released before returning. Drain the usage counter to wait for
* all cgwb's and cgwb_congested's ever created on @bdi.
*/
/* drop the base count set up by cgwb_bdi_init() */
atomic_dec(&bdi->usage_cnt);
wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
}
/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Kills every wb linked on @memcg's cgwb list and then marks the list as
 * dead, which prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	/* cgwb_kill() unlinks @wb from this list, hence the _safe iterator */
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	/* cgwb_create() won't link a wb on a list whose ->next is NULL */
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}
/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @blkcg: blkcg being offlined
 *
 * Kills every wb linked on @blkcg->cgwb_list and then marks the list as
 * dead, which prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct blkcg *blkcg)
{
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	/* cgwb_kill() unlinks @wb from this list, hence the _safe iterator */
	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
		cgwb_kill(wb);
	/* cgwb_create() won't link a wb on a list whose ->next is NULL */
	blkcg->cgwb_list.next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}
#else /* CONFIG_CGROUP_WRITEBACK */
/*
* Without CONFIG_CGROUP_WRITEBACK a bdi has no cgroup writeback state, so
* there is nothing to initialize or destroy.
*/
static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
#endif /* CONFIG_CGROUP_WRITEBACK */
int bdi_init(struct backing_dev_info *bdi) int bdi_init(struct backing_dev_info *bdi)
{ {
int err; int err;
...@@ -386,6 +781,7 @@ int bdi_init(struct backing_dev_info *bdi) ...@@ -386,6 +781,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->wb_congested.state = 0; bdi->wb_congested.state = 0;
bdi->wb.congested = &bdi->wb_congested; bdi->wb.congested = &bdi->wb_congested;
cgwb_bdi_init(bdi);
return 0; return 0;
} }
EXPORT_SYMBOL(bdi_init); EXPORT_SYMBOL(bdi_init);
...@@ -459,6 +855,7 @@ void bdi_destroy(struct backing_dev_info *bdi) ...@@ -459,6 +855,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */ /* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi); bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb); wb_shutdown(&bdi->wb);
cgwb_bdi_destroy(bdi);
if (bdi->dev) { if (bdi->dev) {
bdi_debug_unregister(bdi); bdi_debug_unregister(bdi);
......
...@@ -348,6 +348,10 @@ struct mem_cgroup { ...@@ -348,6 +348,10 @@ struct mem_cgroup {
atomic_t numainfo_updating; atomic_t numainfo_updating;
#endif #endif
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
/* List of events which userspace want to receive */ /* List of events which userspace want to receive */
struct list_head event_list; struct list_head event_list;
spinlock_t event_list_lock; spinlock_t event_list_lock;
...@@ -4030,6 +4034,15 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) ...@@ -4030,6 +4034,15 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
} }
#endif #endif
#ifdef CONFIG_CGROUP_WRITEBACK
/*
* mem_cgroup_cgwb_list - return the cgwb list head of a memcg
* @memcg: memcg of interest
*
* Returns the address of @memcg->cgwb_list, the list on which wb's
* associated with @memcg are linked through their memcg_node.
* NOTE(review): presumably exists because struct mem_cgroup's layout is
* not visible to the writeback code - confirm.
*/
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
{
return &memcg->cgwb_list;
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/* /*
* DO NOT USE IN NEW FILES. * DO NOT USE IN NEW FILES.
* *
...@@ -4494,7 +4507,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ...@@ -4494,7 +4507,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_MEMCG_KMEM #ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1; memcg->kmemcg_id = -1;
#endif #endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
return &memcg->css; return &memcg->css;
free_out: free_out:
...@@ -4582,6 +4597,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) ...@@ -4582,6 +4597,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure); vmpressure_cleanup(&memcg->vmpressure);
memcg_deactivate_kmem(memcg); memcg_deactivate_kmem(memcg);
wb_memcg_offline(memcg);
} }
static void mem_cgroup_css_free(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
......
...@@ -2097,16 +2097,21 @@ int __set_page_dirty_no_writeback(struct page *page) ...@@ -2097,16 +2097,21 @@ int __set_page_dirty_no_writeback(struct page *page)
void account_page_dirtied(struct page *page, struct address_space *mapping, void account_page_dirtied(struct page *page, struct address_space *mapping,
struct mem_cgroup *memcg) struct mem_cgroup *memcg)
{ {
struct inode *inode = mapping->host;
trace_writeback_dirty_page(page, mapping); trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) { if (mapping_cap_account_dirty(mapping)) {
struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct bdi_writeback *wb;
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED); __inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(&bdi->wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(&bdi->wb, WB_DIRTIED); __inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE); task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++; current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits); this_cpu_inc(bdp_ratelimits);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment