Commit 52ebea74 authored by Tejun Heo's avatar Tejun Heo Committed by Jens Axboe

writeback: make backing_dev_info host cgroup-specific bdi_writebacks

For the planned cgroup writeback support, on each bdi
(backing_dev_info), each memcg will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

On the default hierarchy, blkcg implicitly enables memcg.  This allows
using memcg's page ownership for attributing writeback IOs, and every
memcg - blkcg combination can be served by its own wb by assigning a
dedicated wb to each memcg.  This means that there may be multiple
wb's of a bdi mapped to the same blkcg.  As congested state is per
blkcg - bdi combination, those wb's should share the same congested
state.  This is achieved by tracking congested state via
bdi_writeback_congested structs which are keyed by blkcg.

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are created on-demand or
looked up while dirtying an inode according to the memcg of the page
being dirtied or current task.  Each cgwb is indexed on bdi->cgwb_tree
by its memcg id.  Once an inode is associated with its wb, it can be
retrieved using inode_to_wb().

Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all
pages will keep being associated with bdi->wb.

v3: inode_attach_wb() in account_page_dirtied() moved inside
    mapping_cap_account_dirty() block where it's known to be !NULL.
    Also, an unnecessary NULL check before kfree() removed.  Both
    detected by the kbuild bot.

v2: Updated so that wb association is per inode and wb is per memcg
    rather than blkcg.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Cc: kbuild test robot <fengguang.wu@intel.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: default avatarJens Axboe <axboe@fb.com>
parent 89e9b9e0
......@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
......@@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock_irq(&blkcg->lock);
wb_blkcg_offline(blkcg);
}
static void blkcg_css_free(struct cgroup_subsys_state *css)
......@@ -827,7 +830,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
return &blkcg->css;
}
......
......@@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = inode_to_wb(inode);
spin_lock(&bdi->wb.list_lock);
spin_lock(&wb->list_lock);
list_del_init(&inode->i_wb_list);
spin_unlock(&bdi->wb.list_lock);
spin_unlock(&wb->list_lock);
}
/*
......@@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
inode_attach_wb(inode, NULL);
if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
......
......@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx);
......
......@@ -2,8 +2,11 @@
#define __LINUX_BACKING_DEV_DEFS_H
#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
......@@ -37,10 +40,43 @@ enum wb_stat_item {
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
/*
* For cgroup writeback, multiple wb's may map to the same blkcg. Those
* wb's can operate mostly independently but should share the congested
* state. To facilitate such sharing, the congested state is tracked using
* the following struct which is created on demand, indexed by blkcg ID on
* its bdi, and refcounted.
*/
struct bdi_writeback_congested {
unsigned long state; /* WB_[a]sync_congested flags */
#ifdef CONFIG_CGROUP_WRITEBACK
struct backing_dev_info *bdi; /* the associated bdi */
atomic_t refcnt; /* nr of attached wb's and blkg */
int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
#endif
};
/*
* Each wb (bdi_writeback) can perform writeback operations, is measured
* and throttled, independently. Without cgroup writeback, each bdi
* (bdi_writeback) is served by its embedded bdi->wb.
*
* On the default hierarchy, blkcg implicitly enables memcg. This allows
* using memcg's page ownership for attributing writeback IOs, and every
* memcg - blkcg combination can be served by its own wb by assigning a
* dedicated wb to each memcg, which enables isolation across different
* cgroups and propagation of IO back pressure down from the IO layer upto
* the tasks which are generating the dirty pages to be written back.
*
* A cgroup wb is indexed on its bdi by the ID of the associated memcg,
* refcounted with the number of inodes attached to it, and pins the memcg
* and the corresponding blkcg. As the corresponding blkcg for a memcg may
* change as blkcg is disabled and enabled higher up in the hierarchy, a wb
* is tested for blkcg after lookup and removed from index on mismatch so
* that a new wb for the combination can be created.
*/
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
......@@ -78,6 +114,19 @@ struct bdi_writeback {
spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
#ifdef CONFIG_CGROUP_WRITEBACK
struct percpu_ref refcnt; /* used only for !root wb's */
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
union {
struct work_struct release_work;
struct rcu_head rcu;
};
#endif
};
struct backing_dev_info {
......@@ -92,9 +141,13 @@ struct backing_dev_info {
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
struct bdi_writeback wb; /* default writeback info for this bdi */
struct bdi_writeback_congested wb_congested;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct bdi_writeback_congested wb_congested; /* its congested state */
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
#endif
struct device *dev;
struct timer_list laptop_mode_wb_timer;
......
......@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>
int __must_check bdi_init(struct backing_dev_info *bdi);
......@@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word)
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
void wb_congested_put(struct bdi_writeback_congested *congested);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp);
void __inode_attach_wb(struct inode *inode, struct page *page);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct blkcg *blkcg);
/**
* inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
* @inode: inode of interest
......@@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
}
/**
* wb_tryget - try to increment a wb's refcount
* @wb: bdi_writeback to get
*/
static inline bool wb_tryget(struct bdi_writeback *wb)
{
if (wb != &wb->bdi->wb)
return percpu_ref_tryget(&wb->refcnt);
return true;
}
/**
* wb_get - increment a wb's refcount
* @wb: bdi_writeback to get
*/
static inline void wb_get(struct bdi_writeback *wb)
{
if (wb != &wb->bdi->wb)
percpu_ref_get(&wb->refcnt);
}
/**
* wb_put - decrement a wb's refcount
* @wb: bdi_writeback to put
*/
static inline void wb_put(struct bdi_writeback *wb)
{
if (wb != &wb->bdi->wb)
percpu_ref_put(&wb->refcnt);
}
/**
* wb_find_current - find wb for %current on a bdi
* @bdi: bdi of interest
*
* Find the wb of @bdi which matches both the memcg and blkcg of %current.
* Must be called under rcu_read_lock() which protects the returend wb.
* NULL if not found.
*/
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
struct cgroup_subsys_state *memcg_css;
struct bdi_writeback *wb;
memcg_css = task_css(current, memory_cgrp_id);
if (!memcg_css->parent)
return &bdi->wb;
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
/*
* %current's blkcg equals the effective blkcg of its memcg. No
* need to use the relatively expensive cgroup_get_e_css().
*/
if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
return wb;
return NULL;
}
/**
* wb_get_create_current - get or create wb for %current on a bdi
* @bdi: bdi of interest
* @gfp: allocation mask
*
* Equivalent to wb_get_create() on %current's memcg. This function is
* called from a relatively hot path and optimizes the common cases using
* wb_find_current().
*/
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
struct bdi_writeback *wb;
rcu_read_lock();
wb = wb_find_current(bdi);
if (wb && unlikely(!wb_tryget(wb)))
wb = NULL;
rcu_read_unlock();
if (unlikely(!wb)) {
struct cgroup_subsys_state *memcg_css;
memcg_css = task_get_css(current, memory_cgrp_id);
wb = wb_get_create(bdi, memcg_css, gfp);
css_put(memcg_css);
}
return wb;
}
/**
* inode_attach_wb - associate an inode with its wb
* @inode: inode of interest
* @page: page being dirtied (may be NULL)
*
* If @inode doesn't have its wb, associate it with the wb matching the
* memcg of @page or, if @page is NULL, %current. May be called w/ or w/o
* @inode->i_lock.
*/
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
if (!inode->i_wb)
__inode_attach_wb(inode, page);
}
/**
* inode_detach_wb - disassociate an inode from its wb
* @inode: inode of interest
*
* @inode is being freed. Detach from its wb.
*/
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
}
/**
* inode_to_wb - determine the wb of an inode
* @inode: inode of interest
*
* Returns the wb @inode is currently associated with.
*/
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
return inode->i_wb;
}
#else /* CONFIG_CGROUP_WRITEBACK */
static inline bool inode_cgwb_enabled(struct inode *inode)
......@@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
return false;
}
static inline struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
{
return bdi->wb.congested;
}
static inline void wb_congested_put(struct bdi_writeback_congested *congested)
{
}
static inline bool wb_tryget(struct bdi_writeback *wb)
{
return true;
}
static inline void wb_get(struct bdi_writeback *wb)
{
}
static inline void wb_put(struct bdi_writeback *wb)
{
}
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
return &bdi->wb;
}
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
return &bdi->wb;
}
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
}
static inline void inode_detach_wb(struct inode *inode)
{
}
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
return &inode_to_bdi(inode)->wb;
}
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}
static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
}
#endif /* CONFIG_CGROUP_WRITEBACK */
#endif /* _LINUX_BACKING_DEV_H */
......@@ -53,6 +53,10 @@ struct blkcg {
/* TODO: per-policy storage in blkcg */
unsigned int cfq_weight; /* belongs to cfq */
unsigned int cfq_leaf_weight;
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
};
struct blkg_stat {
......
......@@ -35,6 +35,7 @@
#include <uapi/linux/fs.h>
struct backing_dev_info;
struct bdi_writeback;
struct export_operations;
struct hd_geometry;
struct iovec;
......@@ -635,6 +636,9 @@ struct inode {
struct hlist_node i_hash;
struct list_head i_wb_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */
#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
union {
......
......@@ -388,6 +388,10 @@ enum {
OVER_LIMIT,
};
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
#endif
struct sock;
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
void sock_update_memcg(struct sock *sk);
......
This diff is collapsed.
......@@ -348,6 +348,10 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
......@@ -4030,6 +4034,15 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
}
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
{
return &memcg->cgwb_list;
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* DO NOT USE IN NEW FILES.
*
......@@ -4494,7 +4507,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
return &memcg->css;
free_out:
......@@ -4582,6 +4597,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
memcg_deactivate_kmem(memcg);
wb_memcg_offline(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
......
......@@ -2097,16 +2097,21 @@ int __set_page_dirty_no_writeback(struct page *page)
void account_page_dirtied(struct page *page, struct address_space *mapping,
struct mem_cgroup *memcg)
{
struct inode *inode = mapping->host;
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
struct bdi_writeback *wb;
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(&bdi->wb, WB_RECLAIMABLE);
__inc_wb_stat(&bdi->wb, WB_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment