Commit 95b88300 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] writer throttling fix

The patch fixes a few problems in the writer throttling code.  Mainly
in the situation where a single large file is being written out.

That file could be parked on sb->locked_inodes due to pdflush
writeback, and the writer throttling path coming out of
balance_dirty_pages() forgot to look for inodes on ->locked_inodes.

The net effect was that the amount of dirty memory was exceeding the
limit set in /proc/sys/vm/dirty_async_ratio, possibly to the point
where the system gets seriously choked.

The patch removes sb->locked_inodes altogether and teaches the
throttling code to look for inodes on sb->s_io as well as sb->s_dirty.

Also, just leave unwritten dirty pages on mapping->io_pages, and
unwritten dirty inodes on sb->s_io.  Putting them back onto
->dirty_pages and ->dirty_inodes was fairly pointless, given that both
lists need to be looked at.
parent 0d8b3b44
...@@ -134,8 +134,6 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) ...@@ -134,8 +134,6 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb; struct super_block *sb = inode->i_sb;
list_move(&inode->i_list, &sb->s_locked_inodes);
BUG_ON(inode->i_state & I_LOCK); BUG_ON(inode->i_state & I_LOCK);
/* Set I_LOCK, reset I_DIRTY */ /* Set I_LOCK, reset I_DIRTY */
...@@ -163,12 +161,12 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) ...@@ -163,12 +161,12 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
if (inode->i_state & I_DIRTY) { /* Redirtied */ if (inode->i_state & I_DIRTY) { /* Redirtied */
list_add(&inode->i_list, &sb->s_dirty); list_add(&inode->i_list, &sb->s_dirty);
} else { } else {
if (!list_empty(&mapping->dirty_pages)) { if (!list_empty(&mapping->dirty_pages) ||
!list_empty(&mapping->io_pages)) {
/* Not a whole-file writeback */ /* Not a whole-file writeback */
mapping->dirtied_when = orig_dirtied_when; mapping->dirtied_when = orig_dirtied_when;
inode->i_state |= I_DIRTY_PAGES; inode->i_state |= I_DIRTY_PAGES;
list_add_tail(&inode->i_list, list_add_tail(&inode->i_list, &sb->s_dirty);
&sb->s_dirty);
} else if (atomic_read(&inode->i_count)) { } else if (atomic_read(&inode->i_count)) {
list_add(&inode->i_list, &inode_in_use); list_add(&inode->i_list, &inode_in_use);
} else { } else {
...@@ -205,7 +203,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) ...@@ -205,7 +203,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* If older_than_this is non-NULL, then only write out mappings which * If older_than_this is non-NULL, then only write out mappings which
* had their first dirtying at a time earlier than *older_than_this. * had their first dirtying at a time earlier than *older_than_this.
* *
* If we're a pdlfush thread, then implement pdlfush collision avoidance * If we're a pdflush thread, then implement pdflush collision avoidance
* against the entire list. * against the entire list.
* *
* WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
...@@ -221,6 +219,11 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) ...@@ -221,6 +219,11 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* FIXME: this linear search could get expensive with many filesystems. But * FIXME: this linear search could get expensive with many filesystems. But
* how to fix? We need to go from an address_space to all inodes which share * how to fix? We need to go from an address_space to all inodes which share
* a queue with that address_space. * a queue with that address_space.
*
* The inodes to be written are parked on sb->s_io. They are moved back onto
* sb->s_dirty as they are selected for writing. This way, none can be missed
* on the writer throttling path, and we get decent balancing between many
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/ */
static void static void
sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
...@@ -241,7 +244,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, ...@@ -241,7 +244,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
if (single_bdi && mapping->backing_dev_info != single_bdi) { if (single_bdi && mapping->backing_dev_info != single_bdi) {
if (sb != blockdev_superblock) if (sb != blockdev_superblock)
break; /* inappropriate superblock */ break; /* inappropriate superblock */
list_move(&inode->i_list, &inode->i_sb->s_dirty); list_move(&inode->i_list, &sb->s_dirty);
continue; /* not this blockdev */ continue; /* not this blockdev */
} }
...@@ -263,10 +266,11 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, ...@@ -263,10 +266,11 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
BUG_ON(inode->i_state & I_FREEING); BUG_ON(inode->i_state & I_FREEING);
__iget(inode); __iget(inode);
list_move(&inode->i_list, &sb->s_dirty);
__writeback_single_inode(inode, really_sync, nr_to_write); __writeback_single_inode(inode, really_sync, nr_to_write);
if (sync_mode == WB_SYNC_HOLD) { if (sync_mode == WB_SYNC_HOLD) {
mapping->dirtied_when = jiffies; mapping->dirtied_when = jiffies;
list_move(&inode->i_list, &inode->i_sb->s_dirty); list_move(&inode->i_list, &sb->s_dirty);
} }
if (current_is_pdflush()) if (current_is_pdflush())
writeback_release(bdi); writeback_release(bdi);
...@@ -278,9 +282,8 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, ...@@ -278,9 +282,8 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
} }
out: out:
/* /*
* Put the rest back, in the correct order. * Leave any unwritten inodes on s_io.
*/ */
list_splice_init(&sb->s_io, sb->s_dirty.prev);
return; return;
} }
...@@ -302,7 +305,7 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write, ...@@ -302,7 +305,7 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
spin_lock(&sb_lock); spin_lock(&sb_lock);
sb = sb_entry(super_blocks.prev); sb = sb_entry(super_blocks.prev);
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
if (!list_empty(&sb->s_dirty)) { if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
spin_unlock(&sb_lock); spin_unlock(&sb_lock);
sync_sb_inodes(bdi, sb, sync_mode, nr_to_write, sync_sb_inodes(bdi, sb, sync_mode, nr_to_write,
older_than_this); older_than_this);
...@@ -321,7 +324,7 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write, ...@@ -321,7 +324,7 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
* Note: * Note:
* We don't need to grab a reference to superblock here. If it has non-empty * We don't need to grab a reference to superblock here. If it has non-empty
* ->s_dirty it hadn't been killed yet and kill_super() won't proceed * ->s_dirty it hadn't been killed yet and kill_super() won't proceed
* past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
* empty. Since __sync_single_inode() regains inode_lock before it finally moves * empty. Since __sync_single_inode() regains inode_lock before it finally moves
* inode from superblock lists we are OK. * inode from superblock lists we are OK.
* *
...@@ -352,19 +355,6 @@ void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write, ...@@ -352,19 +355,6 @@ void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
sync_mode, older_than_this); sync_mode, older_than_this);
} }
/*
 * Wait for every inode on @head to be unlocked.
 *
 * Walks the list from the tail (head->prev), pinning each inode with
 * __iget() so it cannot be freed while we sleep, then drops inode_lock
 * before blocking in __wait_on_inode() (sleeping with the spinlock held
 * would be a bug).  The lock is reacquired before the next list access.
 *
 * NOTE(review): in this patch the only caller passed sb->s_locked_inodes,
 * and the function is removed along with that list — the throttling path
 * now finds these inodes on sb->s_io instead.
 *
 * Caller must hold inode_lock on entry; it is held again on return.
 */
static void __wait_on_locked(struct list_head *head)
{
struct list_head * tmp;
/*
 * Re-read head->prev each iteration: __wait_on_inode() may have slept,
 * and the list can change while inode_lock is dropped.
 */
while ((tmp = head->prev) != head) {
struct inode *inode = list_entry(tmp, struct inode, i_list);
__iget(inode);	/* pin the inode before dropping the lock */
spin_unlock(&inode_lock);
__wait_on_inode(inode);	/* sleep until I_LOCK is cleared */
iput(inode);	/* drop our reference; may free the inode */
spin_lock(&inode_lock);
}
}
/* /*
* writeback and wait upon the filesystem's dirty inodes. The caller will * writeback and wait upon the filesystem's dirty inodes. The caller will
* do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
...@@ -384,8 +374,6 @@ void sync_inodes_sb(struct super_block *sb, int wait) ...@@ -384,8 +374,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
spin_lock(&inode_lock); spin_lock(&inode_lock);
sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD, sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
&nr_to_write, NULL); &nr_to_write, NULL);
if (wait)
__wait_on_locked(&sb->s_locked_inodes);
spin_unlock(&inode_lock); spin_unlock(&inode_lock);
} }
......
...@@ -321,7 +321,6 @@ int invalidate_inodes(struct super_block * sb) ...@@ -321,7 +321,6 @@ int invalidate_inodes(struct super_block * sb)
busy |= invalidate_list(&inode_unused, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away);
busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
busy |= invalidate_list(&sb->s_io, sb, &throw_away); busy |= invalidate_list(&sb->s_io, sb, &throw_away);
busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away);
spin_unlock(&inode_lock); spin_unlock(&inode_lock);
dispose_list(&throw_away); dispose_list(&throw_away);
...@@ -999,11 +998,6 @@ void remove_dquot_ref(struct super_block *sb, int type) ...@@ -999,11 +998,6 @@ void remove_dquot_ref(struct super_block *sb, int type)
if (IS_QUOTAINIT(inode)) if (IS_QUOTAINIT(inode))
remove_inode_dquot_ref(inode, type, &tofree_head); remove_inode_dquot_ref(inode, type, &tofree_head);
} }
list_for_each(act_head, &sb->s_locked_inodes) {
inode = list_entry(act_head, struct inode, i_list);
if (IS_QUOTAINIT(inode))
remove_inode_dquot_ref(inode, type, &tofree_head);
}
spin_unlock(&inode_lock); spin_unlock(&inode_lock);
unlock_kernel(); unlock_kernel();
......
...@@ -599,9 +599,8 @@ mpage_writepages(struct address_space *mapping, ...@@ -599,9 +599,8 @@ mpage_writepages(struct address_space *mapping,
write_lock(&mapping->page_lock); write_lock(&mapping->page_lock);
} }
/* /*
* Put the rest back, in the correct order. * Leave any remaining dirty pages on ->io_pages
*/ */
list_splice_init(&mapping->io_pages, mapping->dirty_pages.prev);
write_unlock(&mapping->page_lock); write_unlock(&mapping->page_lock);
pagevec_deactivate_inactive(&pvec); pagevec_deactivate_inactive(&pvec);
if (bio) if (bio)
......
...@@ -58,7 +58,6 @@ static struct super_block *alloc_super(void) ...@@ -58,7 +58,6 @@ static struct super_block *alloc_super(void)
} }
INIT_LIST_HEAD(&s->s_dirty); INIT_LIST_HEAD(&s->s_dirty);
INIT_LIST_HEAD(&s->s_io); INIT_LIST_HEAD(&s->s_io);
INIT_LIST_HEAD(&s->s_locked_inodes);
INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances); INIT_LIST_HEAD(&s->s_instances);
INIT_LIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_anon);
......
...@@ -660,7 +660,6 @@ struct super_block { ...@@ -660,7 +660,6 @@ struct super_block {
struct list_head s_dirty; /* dirty inodes */ struct list_head s_dirty; /* dirty inodes */
struct list_head s_io; /* parked for writeback */ struct list_head s_io; /* parked for writeback */
struct list_head s_locked_inodes;/* inodes being synced */
struct list_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files; struct list_head s_files;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment