Commit 535ee2fb authored by Jan Kara's avatar Jan Kara Committed by Linus Torvalds

buffer_head: fix private_list handling

There are two possible races in handling of private_list in buffer cache.

1) When fsync_buffers_list() processes a private_list, it clears
   b_assoc_mapping and moves buffer to its private list.  Now
   drop_buffers() comes, sees a buffer is on list so it calls
   __remove_assoc_queue() which complains about b_assoc_mapping being
   cleared (as it cannot propagate possible IO error).  This race has been
   actually observed in the wild.

2) When fsync_buffers_list() processes a private_list,
   mark_buffer_dirty_inode() can be called on bh which is already on the
   private list of fsync_buffers_list().  As buffer is on some list (note
   that the check is performed without private_lock), it is not readded to
   the mapping's private_list and after fsync_buffers_list() finishes, we
   have a dirty buffer which should be on private_list but it isn't.  This
   race has not been reported, probably because most (but not all) callers
   of mark_buffer_dirty_inode() hold i_mutex and thus are serialized with
   fsync().

Fix these issues by not clearing b_assoc_map when fsync_buffers_list()
moves buffer to a dedicated list and by reinserting buffer in private_list
when it is found dirty after we have submitted buffer for IO.  We also
change the tests whether a buffer is on a private list from
!list_empty(&bh->b_assoc_buffers) to bh->b_assoc_map so that they are
single word reads and hence lockless checks are safe.
Signed-off-by: default avatarJan Kara <jack@suse.cz>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent f6f21c81
...@@ -678,7 +678,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) ...@@ -678,7 +678,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
} else { } else {
BUG_ON(mapping->assoc_mapping != buffer_mapping); BUG_ON(mapping->assoc_mapping != buffer_mapping);
} }
if (list_empty(&bh->b_assoc_buffers)) { if (!bh->b_assoc_map) {
spin_lock(&buffer_mapping->private_lock); spin_lock(&buffer_mapping->private_lock);
list_move_tail(&bh->b_assoc_buffers, list_move_tail(&bh->b_assoc_buffers,
&mapping->private_list); &mapping->private_list);
...@@ -794,6 +794,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) ...@@ -794,6 +794,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{ {
struct buffer_head *bh; struct buffer_head *bh;
struct list_head tmp; struct list_head tmp;
struct address_space *mapping;
int err = 0, err2; int err = 0, err2;
INIT_LIST_HEAD(&tmp); INIT_LIST_HEAD(&tmp);
...@@ -801,9 +802,14 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) ...@@ -801,9 +802,14 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
spin_lock(lock); spin_lock(lock);
while (!list_empty(list)) { while (!list_empty(list)) {
bh = BH_ENTRY(list->next); bh = BH_ENTRY(list->next);
mapping = bh->b_assoc_map;
__remove_assoc_queue(bh); __remove_assoc_queue(bh);
/* Avoid race with mark_buffer_dirty_inode() which does
* a lockless check and we rely on seeing the dirty bit */
smp_mb();
if (buffer_dirty(bh) || buffer_locked(bh)) { if (buffer_dirty(bh) || buffer_locked(bh)) {
list_add(&bh->b_assoc_buffers, &tmp); list_add(&bh->b_assoc_buffers, &tmp);
bh->b_assoc_map = mapping;
if (buffer_dirty(bh)) { if (buffer_dirty(bh)) {
get_bh(bh); get_bh(bh);
spin_unlock(lock); spin_unlock(lock);
...@@ -822,8 +828,17 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) ...@@ -822,8 +828,17 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
while (!list_empty(&tmp)) { while (!list_empty(&tmp)) {
bh = BH_ENTRY(tmp.prev); bh = BH_ENTRY(tmp.prev);
list_del_init(&bh->b_assoc_buffers);
get_bh(bh); get_bh(bh);
mapping = bh->b_assoc_map;
__remove_assoc_queue(bh);
/* Avoid race with mark_buffer_dirty_inode() which does
* a lockless check and we rely on seeing the dirty bit */
smp_mb();
if (buffer_dirty(bh)) {
list_add(&bh->b_assoc_buffers,
&bh->b_assoc_map->private_list);
bh->b_assoc_map = mapping;
}
spin_unlock(lock); spin_unlock(lock);
wait_on_buffer(bh); wait_on_buffer(bh);
if (!buffer_uptodate(bh)) if (!buffer_uptodate(bh))
...@@ -1195,7 +1210,7 @@ void __brelse(struct buffer_head * buf) ...@@ -1195,7 +1210,7 @@ void __brelse(struct buffer_head * buf)
void __bforget(struct buffer_head *bh) void __bforget(struct buffer_head *bh)
{ {
clear_buffer_dirty(bh); clear_buffer_dirty(bh);
if (!list_empty(&bh->b_assoc_buffers)) { if (bh->b_assoc_map) {
struct address_space *buffer_mapping = bh->b_page->mapping; struct address_space *buffer_mapping = bh->b_page->mapping;
spin_lock(&buffer_mapping->private_lock); spin_lock(&buffer_mapping->private_lock);
...@@ -3022,7 +3037,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) ...@@ -3022,7 +3037,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
do { do {
struct buffer_head *next = bh->b_this_page; struct buffer_head *next = bh->b_this_page;
if (!list_empty(&bh->b_assoc_buffers)) if (bh->b_assoc_map)
__remove_assoc_queue(bh); __remove_assoc_queue(bh);
bh = next; bh = next;
} while (bh != head); } while (bh != head);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment