Commit f5fbe6b7 authored by Roman Gushchin, committed by Linus Torvalds

writeback, cgroup: support switching multiple inodes at once

Currently, only a single inode can be switched to another writeback
structure at a time.  That means that to switch an inode, a separate
inode_switch_wbs_context structure must be allocated, and a separate rcu
callback and work must be scheduled.

This per-switch overhead is fine for the existing ad-hoc switching, which
does not happen that often, but it is sub-optimal for the mass switching
required to release a writeback structure.  To prepare for that, let's add
support for switching multiple inodes at once.

Instead of containing a single inode pointer, inode_switch_wbs_context
will contain a NULL-terminated array of inode pointers.
inode_do_switch_wbs() will be called for each inode.
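Concretely, the reshaped context and the per-inode loop look roughly like
this (condensed from the change below; locking and sanity checks omitted
in this sketch):

struct inode_switch_wbs_context {
	struct rcu_work		work;
	struct bdi_writeback	*new_wb;
	struct inode		*inodes[];	/* NULL-terminated array of inodes to switch */
};

/* In the deferred work function, each queued inode is switched in turn: */
for (inodep = isw->inodes; *inodep; inodep++)
	if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
		nr_switched++;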

To optimize the locking, bdi->wb_switch_rwsem and old_wb's and new_wb's
list_locks will be acquired and released only once, for all inodes
together.  wb_wakeup() will also be called only once.  Instead of calling
wb_put(old_wb) after each successful switch, wb_put_many() is introduced
and used.
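wb_put_many() ends up as a thin wrapper that hands the whole count to
percpu_ref_put_many(), so a batch of successful switches costs a single
refcount operation.  A condensed sketch of the header change below (the
WARN_ON_ONCE sanity check is omitted here):

static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
	if (wb != &wb->bdi->wb)
		percpu_ref_put_many(&wb->refcnt, nr);
}

static inline void wb_put(struct bdi_writeback *wb)
{
	wb_put_many(wb, 1);
}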

Link: https://lkml.kernel.org/r/20210608230225.2078447-8-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Dennis Zhou <dennis@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 72d4512e
@@ -335,10 +335,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 }
 
 struct inode_switch_wbs_context {
-	struct inode		*inode;
-	struct bdi_writeback	*new_wb;
-
 	struct rcu_work		work;
+
+	/*
+	 * Multiple inodes can be switched at once. The switching procedure
+	 * consists of two parts, separated by a RCU grace period. To make
+	 * sure that the second part is executed for each inode gone through
+	 * the first part, all inode pointers are placed into a NULL-terminated
+	 * array embedded into struct inode_switch_wbs_context. Otherwise
+	 * an inode could be left in a non-consistent state.
+	 */
+	struct bdi_writeback	*new_wb;
+	struct inode		*inodes[];
 };
 
 static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
@@ -351,39 +359,15 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
 	up_write(&bdi->wb_switch_rwsem);
 }
 
-static void inode_do_switch_wbs(struct inode *inode,
+static bool inode_do_switch_wbs(struct inode *inode,
+				struct bdi_writeback *old_wb,
 				struct bdi_writeback *new_wb)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
-	struct bdi_writeback *old_wb = inode->i_wb;
 	XA_STATE(xas, &mapping->i_pages, 0);
 	struct page *page;
 	bool switched = false;
 
-	/*
-	 * If @inode switches cgwb membership while sync_inodes_sb() is
-	 * being issued, sync_inodes_sb() might miss it. Synchronize.
-	 */
-	down_read(&bdi->wb_switch_rwsem);
-
-	/*
-	 * By the time control reaches here, RCU grace period has passed
-	 * since I_WB_SWITCH assertion and all wb stat update transactions
-	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
-	 * synchronizing against the i_pages lock.
-	 *
-	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
-	 * gives us exclusion against all wb related operations on @inode
-	 * including IO list manipulations and stat updates.
-	 */
-	if (old_wb < new_wb) {
-		spin_lock(&old_wb->list_lock);
-		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
-	} else {
-		spin_lock(&new_wb->list_lock);
-		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
-	}
-
 	spin_lock(&inode->i_lock);
 	xa_lock_irq(&mapping->i_pages);
@@ -458,25 +442,63 @@ static void inode_do_switch_wbs(struct inode *inode,
 	xa_unlock_irq(&mapping->i_pages);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&new_wb->list_lock);
-	spin_unlock(&old_wb->list_lock);
 
-	up_read(&bdi->wb_switch_rwsem);
-
-	if (switched) {
-		wb_wakeup(new_wb);
-		wb_put(old_wb);
-	}
+	return switched;
 }
 
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
 	struct inode_switch_wbs_context *isw =
 		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+	struct bdi_writeback *new_wb = isw->new_wb;
+	unsigned long nr_switched = 0;
+	struct inode **inodep;
 
-	inode_do_switch_wbs(isw->inode, isw->new_wb);
-	wb_put(isw->new_wb);
-	iput(isw->inode);
+	/*
+	 * If @inode switches cgwb membership while sync_inodes_sb() is
+	 * being issued, sync_inodes_sb() might miss it. Synchronize.
+	 */
+	down_read(&bdi->wb_switch_rwsem);
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against the i_pages lock.
+	 *
+	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
+	 * gives us exclusion against all wb related operations on @inode
+	 * including IO list manipulations and stat updates.
+	 */
+	if (old_wb < new_wb) {
+		spin_lock(&old_wb->list_lock);
+		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&new_wb->list_lock);
+		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+	}
+
+	for (inodep = isw->inodes; *inodep; inodep++) {
+		WARN_ON_ONCE((*inodep)->i_wb != old_wb);
+		if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
+			nr_switched++;
+	}
+
+	spin_unlock(&new_wb->list_lock);
+	spin_unlock(&old_wb->list_lock);
+
+	up_read(&bdi->wb_switch_rwsem);
+
+	if (nr_switched) {
+		wb_wakeup(new_wb);
+		wb_put_many(old_wb, nr_switched);
+	}
+
+	for (inodep = isw->inodes; *inodep; inodep++)
+		iput(*inodep);
+	wb_put(new_wb);
 
 	kfree(isw);
 	atomic_dec(&isw_nr_in_flight);
 }
@@ -503,7 +525,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
 		return;
 
-	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
 	if (!isw)
 		return;
@@ -530,7 +552,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
 
-	isw->inode = inode;
+	isw->inodes[0] = inode;
 
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
@@ -240,8 +240,9 @@ static inline void wb_get(struct bdi_writeback *wb)
 /**
  * wb_put - decrement a wb's refcount
  * @wb: bdi_writeback to put
+ * @nr: number of references to put
  */
-static inline void wb_put(struct bdi_writeback *wb)
+static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
 {
 	if (WARN_ON_ONCE(!wb->bdi)) {
 		/*
@@ -252,7 +253,16 @@ static inline void wb_put(struct bdi_writeback *wb)
 	}
 
 	if (wb != &wb->bdi->wb)
-		percpu_ref_put(&wb->refcnt);
+		percpu_ref_put_many(&wb->refcnt, nr);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	wb_put_many(wb, 1);
 }
 
 /**
@@ -281,6 +291,10 @@ static inline void wb_put(struct bdi_writeback *wb)
 {
 }
 
+static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
+{
+}
+
 static inline bool wb_dying(struct bdi_writeback *wb)
 {
 	return false;