Commit 610c5ab8 authored by Andrew Morton, committed by Arnaldo Carvalho de Melo

[PATCH] dirty inode management

Fix the "race with umount" in __sync_list().  __sync_list() no longer
puts inodes onto a local list while writing them out.

The super_block.s_dirty list is kept time-ordered.  Mappings which
have the "oldest" ->dirtied_when are kept at sb->s_dirty.prev.

So the time-based writeback (kupdate) can just bail out when it
encounters a not-old-enough mapping, rather than walking the entire
list.
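
In outline (a sketch of the walk only - the real code is in the
__sync_list() hunk below), the scan starts at the oldest end and
stops at the first mapping which is too new:

	/* s_dirty is time-ordered: oldest entries are at head->prev */
	while ((tmp = head->prev) != head) {
		struct inode *inode = list_entry(tmp, struct inode, i_list);

		if (older_than_this &&
		    time_after(inode->i_mapping->dirtied_when, *older_than_this))
			break;		/* everything remaining is newer */
		/* ... write this inode back ... */
	}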

dirtied_when is set on the *first* dirtying of a mapping.  Once the
mapping is marked dirty it strictly retains its place on s_dirty until
it reaches the oldest end and is written out.  So a frequently-redirtied
mapping cannot hover at the young end of the list indefinitely - it
still ages towards the oldest end and gets written back.
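
Concretely, the stamp is applied only on the clean->dirty transition
(a sketch of the __mark_inode_dirty() change below):

	const int was_dirty = inode->i_state & I_DIRTY;

	inode->i_state |= flags;
	if (!was_dirty) {
		/* first dirtying: timestamp it, queue at the young end */
		inode->i_mapping->dirtied_when = jiffies;
		list_del(&inode->i_list);
		list_add(&inode->i_list, &sb->s_dirty);
	}
	/* redirtying: dirtied_when and list position are left alone */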

That local inode list was there for livelock avoidance.  Livelock is
instead avoided by looking at each mapping's ->dirtied_when.  If we
encounter one which was dirtied after this invocation of __sync_list(),
we just bail out - the sync functions are only required to write out
data which was dirty at the time when they were called.
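
The check is simply (sketch - see __sync_list() below):

	const unsigned long start = jiffies;	/* time of this invocation */
	...
	/* Was this inode dirtied after __sync_list was called? */
	if (time_after(mapping->dirtied_when, start))
		break;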

Keeping the s_dirty list in time-order is the right thing to do anyway
- so all the various writeback callers always work against the oldest
data.
parent 2d8f24d0
@@ -62,8 +62,14 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
+		const int was_dirty = inode->i_state & I_DIRTY;
+		struct address_space *mapping = inode->i_mapping;
+
 		inode->i_state |= flags;
+		if (!was_dirty)
+			mapping->dirtied_when = jiffies;
+
 		/*
 		 * If the inode is locked, just update its dirty state.
 		 * The unlocker will place the inode on the appropriate
@@ -78,11 +84,16 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (list_empty(&inode->i_hash) && !S_ISBLK(inode->i_mode))
 			goto same_list;
-		if (inode->i_mapping->dirtied_when == 0)
-			inode->i_mapping->dirtied_when = jiffies;
-		list_del(&inode->i_list);
-		list_add(&inode->i_list, &sb->s_dirty);
+
+		/*
+		 * If the inode was already on s_dirty, don't reposition
+		 * it (that would break s_dirty time-ordering).
+		 */
+		if (!was_dirty) {
+			list_del(&inode->i_list);
+			list_add(&inode->i_list, &sb->s_dirty);
+		}
 	}
 same_list:
 	spin_unlock(&inode_lock);
 }
@@ -116,18 +127,20 @@ static inline void write_inode(struct inode *inode, int sync)
 static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
 {
 	unsigned dirty;
+	unsigned long orig_dirtied_when;
 	struct address_space *mapping = inode->i_mapping;
 
 	list_del(&inode->i_list);
 	list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
-	if (inode->i_state & I_LOCK)
-		BUG();
+	BUG_ON(inode->i_state & I_LOCK);
 
 	/* Set I_LOCK, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state |= I_LOCK;
 	inode->i_state &= ~I_DIRTY;
+	orig_dirtied_when = mapping->dirtied_when;
+	mapping->dirtied_when = 0;	/* assume it's whole-file writeback */
 	spin_unlock(&inode_lock);
 
 	if (wait)
@@ -145,35 +158,25 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
 	if (wait)
 		filemap_fdatawait(mapping);
 
-	/*
-	 * For non-blocking writeout (wait == 0), we still
-	 * count the inode as being clean.
-	 */
 	spin_lock(&inode_lock);
-
-	/*
-	 * Did we write back all the pages?
-	 */
-	if (nr_to_write && *nr_to_write == 0) {
-		/*
-		 * Maybe not
-		 */
-		if (!list_empty(&mapping->dirty_pages))	/* No lock needed */
-			inode->i_state |= I_DIRTY_PAGES;
-	}
 	inode->i_state &= ~I_LOCK;
 	if (!(inode->i_state & I_FREEING)) {
-		struct list_head *to;
-
-		if (inode->i_state & I_DIRTY)
-			to = &inode->i_sb->s_dirty;
-		else if (atomic_read(&inode->i_count))
-			to = &inode_in_use;
-		else
-			to = &inode_unused;
 		list_del(&inode->i_list);
-		list_add(&inode->i_list, to);
+		if (!list_empty(&mapping->dirty_pages)) {
+			/* Not a whole-file writeback */
+			mapping->dirtied_when = orig_dirtied_when;
+			inode->i_state |= I_DIRTY_PAGES;
+			list_add_tail(&inode->i_list, &inode->i_sb->s_dirty);
+		} else if (inode->i_state & I_DIRTY) {
+			list_add(&inode->i_list, &inode->i_sb->s_dirty);
+		} else if (atomic_read(&inode->i_count)) {
+			list_add(&inode->i_list, &inode_in_use);
+		} else {
+			list_add(&inode->i_list, &inode_unused);
+		}
 	}
-	if (waitqueue_active(&inode->i_wait))
-		wake_up(&inode->i_wait);
+	wake_up(&inode->i_wait);
 }
@@ -201,38 +204,34 @@ void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
 }
 
 /*
- * Write out a list of inodes' pages, and the inode itself.
+ * Write out a list of dirty inodes.
  *
- * If `sync' is true, wait on writeout of the last mapping
- * which we write.
+ * If `sync' is true, wait on writeout of the last mapping which we write.
  *
  * If older_than_this is non-NULL, then only write out mappings which
  * had their first dirtying at a time earlier than *older_than_this.
  *
 * Called under inode_lock.
- *
- * FIXME: putting all the inodes on a local list could introduce a
- * race with umount.  Bump the superblock refcount?
 */
 static void __sync_list(struct list_head *head, int sync_mode,
 		int *nr_to_write, unsigned long *older_than_this)
 {
-	struct list_head * tmp;
-	LIST_HEAD(hold);	/* Unready inodes go here */
+	struct list_head *tmp;
+	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	while ((tmp = head->prev) != head) {
 		struct inode *inode = list_entry(tmp, struct inode, i_list);
 		struct address_space *mapping = inode->i_mapping;
 		int really_sync;
 
-		if (older_than_this && *older_than_this) {
-			if (time_after(mapping->dirtied_when,
-					*older_than_this)) {
-				list_del(&inode->i_list);
-				list_add(&inode->i_list, &hold);
-				continue;
-			}
-		}
+		/* Was this inode dirtied after __sync_list was called? */
+		if (time_after(mapping->dirtied_when, start))
+			break;
+
+		if (older_than_this &&
+				time_after(mapping->dirtied_when, *older_than_this))
+			break;
+
 		really_sync = (sync_mode == WB_SYNC_ALL);
 		if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
 			really_sync = 1;
@@ -240,11 +239,7 @@ static void __sync_list(struct list_head *head, int sync_mode,
 		if (nr_to_write && *nr_to_write == 0)
 			break;
 	}
-	/*
-	 * Put the not-ready inodes back
-	 */
-	if (!list_empty(&hold))
-		list_splice(&hold, head);
+	return;
 }
 
 /*
@@ -258,8 +253,7 @@ static void __sync_list(struct list_head *head, int sync_mode,
  * inode from superblock lists we are OK.
  *
 * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.  Unless *older_than_this is
- * zero.  In which case we flush everything, like the old (dumb) wakeup_bdflush.
+ * flushtime older than *older_than_this.
 */
 void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
 			unsigned long *older_than_this)
@@ -434,11 +428,13 @@ void try_to_writeback_unused_inodes(unsigned long pexclusive)
 	spin_lock(&inode_lock);
 	spin_lock(&sb_lock);
 	sb = sb_entry(super_blocks.next);
-	for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
+	for (; nr_inodes && sb != sb_entry(&super_blocks);
+			sb = sb_entry(sb->s_list.next)) {
 		if (list_empty(&sb->s_dirty))
 			continue;
 		spin_unlock(&sb_lock);
-		nr_inodes = __try_to_writeback_unused_list(&sb->s_dirty, nr_inodes);
+		nr_inodes = __try_to_writeback_unused_list(&sb->s_dirty,
+						nr_inodes);
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
...
@@ -175,10 +175,6 @@ static int wb_writeback_jifs = 5 * HZ;
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
- * Spot the bug: at jiffies wraparound, the attempt to set the inode's dirtying
- * time won't work, because zero means not-dirty.  That's OK.  The data will get
- * written out later by the VM (at least).
- *
 * We also limit the number of pages which are written out, to avoid writing
 * huge amounts of data against a single file, which would cause memory
 * allocators to block for too long.
@@ -328,7 +324,6 @@ int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write)
 	list_splice(&mapping->dirty_pages, &mapping->io_pages);
 	INIT_LIST_HEAD(&mapping->dirty_pages);
-	mapping->dirtied_when = 0;
 
 	while (!list_empty(&mapping->io_pages) && !done) {
 		struct page *page = list_entry(mapping->io_pages.prev,