Commit d58e41ee authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] Fix concurrent writepage and readpage

Pages under writeback are not locked.  So it is possible (and quite
legal) for a page to be under readpage() while it is still under
writeback.  For a partially uptodate page with blocksize <
PAGE_CACHE_SIZE.

When this happens, the read and write I/O completion handlers get
confused over the shared BH_Async usage and the page ends up not
getting PG_writeback cleared.  Truncate gets stuck in D state.

The patch separates the read and write I/O completion state.

It also shuffles the buffer fields around.  Putting the
commonly-accessed b_state at offset zero shrinks the kernel by a few
hundred bytes because it can be accessed with indirect addressing, not
indirect+indexed.
parent 4bcb6689
...@@ -1598,9 +1598,11 @@ int submit_bh(int rw, struct buffer_head * bh) ...@@ -1598,9 +1598,11 @@ int submit_bh(int rw, struct buffer_head * bh)
BUG_ON(!bh->b_end_io); BUG_ON(!bh->b_end_io);
if ((rw == READ || rw == READA) && buffer_uptodate(bh)) if ((rw == READ || rw == READA) && buffer_uptodate(bh))
printk("%s: read of uptodate buffer\n", __FUNCTION__); buffer_error();
if (rw == WRITE && !buffer_uptodate(bh)) if (rw == WRITE && !buffer_uptodate(bh))
printk("%s: write of non-uptodate buffer\n", __FUNCTION__); buffer_error();
if (rw == READ && buffer_dirty(bh))
buffer_error();
set_buffer_req(bh); set_buffer_req(bh);
......
...@@ -186,6 +186,12 @@ __clear_page_buffers(struct page *page) ...@@ -186,6 +186,12 @@ __clear_page_buffers(struct page *page)
page_cache_release(page); page_cache_release(page);
} }
static void buffer_io_error(struct buffer_head *bh)
{
printk(KERN_ERR "Buffer I/O error on device %s, logical block %ld\n",
bdevname(bh->b_bdev), bh->b_blocknr);
}
/* /*
* Default synchronous end-of-IO handler.. Just mark it up-to-date and * Default synchronous end-of-IO handler.. Just mark it up-to-date and
* unlock the buffer. This is what ll_rw_block uses too. * unlock the buffer. This is what ll_rw_block uses too.
...@@ -193,7 +199,7 @@ __clear_page_buffers(struct page *page) ...@@ -193,7 +199,7 @@ __clear_page_buffers(struct page *page)
void end_buffer_io_sync(struct buffer_head *bh, int uptodate) void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{ {
if (!uptodate) if (!uptodate)
printk("%s: I/O error\n", __FUNCTION__); buffer_io_error(bh);
if (uptodate) if (uptodate)
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
else else
...@@ -537,7 +543,11 @@ static void free_more_memory(void) ...@@ -537,7 +543,11 @@ static void free_more_memory(void)
yield(); yield();
} }
static void end_buffer_io_async(struct buffer_head *bh, int uptodate) /*
* I/O completion handler for block_read_full_page() and brw_page() - pages
* which come unlocked at the end of I/O.
*/
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{ {
static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags; unsigned long flags;
...@@ -545,16 +555,18 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate) ...@@ -545,16 +555,18 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate)
struct page *page; struct page *page;
int page_uptodate = 1; int page_uptodate = 1;
BUG_ON(!buffer_async_read(bh));
if (!uptodate) if (!uptodate)
printk("%s: I/O error\n", __FUNCTION__); buffer_io_error(bh);
if (uptodate) page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
else } else {
clear_buffer_uptodate(bh); clear_buffer_uptodate(bh);
page = bh->b_page;
if (!uptodate)
SetPageError(page); SetPageError(page);
}
/* /*
* Be _very_ careful from here on. Bad things can happen if * Be _very_ careful from here on. Bad things can happen if
...@@ -562,13 +574,13 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate) ...@@ -562,13 +574,13 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate)
* decide that the page is now completely done. * decide that the page is now completely done.
*/ */
spin_lock_irqsave(&page_uptodate_lock, flags); spin_lock_irqsave(&page_uptodate_lock, flags);
clear_buffer_async(bh); clear_buffer_async_read(bh);
unlock_buffer(bh); unlock_buffer(bh);
tmp = bh; tmp = bh;
do { do {
if (!buffer_uptodate(tmp)) if (!buffer_uptodate(tmp))
page_uptodate = 0; page_uptodate = 0;
if (buffer_async(tmp)) { if (buffer_async_read(tmp)) {
if (buffer_locked(tmp)) if (buffer_locked(tmp))
goto still_busy; goto still_busy;
if (!buffer_mapped(bh)) if (!buffer_mapped(bh))
...@@ -584,13 +596,53 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate) ...@@ -584,13 +596,53 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate)
*/ */
if (page_uptodate && !PageError(page)) if (page_uptodate && !PageError(page))
SetPageUptodate(page); SetPageUptodate(page);
if (PageWriteback(page)) {
/* It was a write */
end_page_writeback(page);
} else {
/* read */
unlock_page(page); unlock_page(page);
return;
still_busy:
spin_unlock_irqrestore(&page_uptodate_lock, flags);
return;
}
/*
* Completion handler for block_write_full_page() - pages which are unlocked
* during I/O, and which have PageWriteback cleared upon I/O completion.
*/
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags;
struct buffer_head *tmp;
struct page *page;
BUG_ON(!buffer_async_write(bh));
if (!uptodate)
buffer_io_error(bh);
page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
SetPageError(page);
}
spin_lock_irqsave(&page_uptodate_lock, flags);
clear_buffer_async_write(bh);
unlock_buffer(bh);
tmp = bh->b_this_page;
while (tmp != bh) {
if (buffer_async_write(tmp)) {
if (buffer_locked(tmp))
goto still_busy;
if (!buffer_mapped(bh))
BUG();
}
tmp = tmp->b_this_page;
} }
spin_unlock_irqrestore(&page_uptodate_lock, flags);
end_page_writeback(page);
return; return;
still_busy: still_busy:
...@@ -599,26 +651,39 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate) ...@@ -599,26 +651,39 @@ static void end_buffer_io_async(struct buffer_head *bh, int uptodate)
} }
/* /*
* If a page's buffers are under async writeout (end_buffer_io_async * If a page's buffers are under async readin (end_buffer_async_read
* completion) then there is a possibility that another thread of * completion) then there is a possibility that another thread of
* control could lock one of the buffers after it has completed * control could lock one of the buffers after it has completed
* but while some of the other buffers have not completed. This * but while some of the other buffers have not completed. This
* locked buffer would confuse end_buffer_io_async() into not unlocking * locked buffer would confuse end_buffer_async_read() into not unlocking
* the page. So the absence of BH_Async tells end_buffer_io_async() * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
* that this buffer is not under async I/O. * that this buffer is not under async I/O.
* *
* The page comes unlocked when it has no locked buffer_async buffers * The page comes unlocked when it has no locked buffer_async buffers
* left. * left.
* *
* The page lock prevents anyone starting new async I/O against any of * PageLocked prevents anyone starting new async I/O reads any of
* the buffers. * the buffers.
*
* PageWriteback is used to prevent simultaneous writeout of the same
* page.
*
* PageLocked prevents anyone from starting writeback of a page which is
* under read I/O (PageWriteback is only ever set against a locked page).
*/ */
inline void set_buffer_async_io(struct buffer_head *bh) inline void mark_buffer_async_read(struct buffer_head *bh)
{
bh->b_end_io = end_buffer_async_read;
set_buffer_async_read(bh);
}
EXPORT_SYMBOL(mark_buffer_async_read);
inline void mark_buffer_async_write(struct buffer_head *bh)
{ {
bh->b_end_io = end_buffer_io_async; bh->b_end_io = end_buffer_async_write;
set_buffer_async(bh); set_buffer_async_write(bh);
} }
EXPORT_SYMBOL(set_buffer_async_io); EXPORT_SYMBOL(mark_buffer_async_write);
/* /*
* osync is designed to support O_SYNC io. It waits synchronously for * osync is designed to support O_SYNC io. It waits synchronously for
...@@ -1207,10 +1272,13 @@ void create_empty_buffers(struct page *page, ...@@ -1207,10 +1272,13 @@ void create_empty_buffers(struct page *page,
tail->b_this_page = head; tail->b_this_page = head;
spin_lock(&page->mapping->host->i_bufferlist_lock); spin_lock(&page->mapping->host->i_bufferlist_lock);
if (PageDirty(page)) { if (PageUptodate(page) || PageDirty(page)) {
bh = head; bh = head;
do { do {
if (PageDirty(page))
set_buffer_dirty(bh); set_buffer_dirty(bh);
if (PageUptodate(page))
set_buffer_uptodate(bh);
bh = bh->b_this_page; bh = bh->b_this_page;
} while (bh != head); } while (bh != head);
} }
...@@ -1308,12 +1376,18 @@ static int __block_write_full_page(struct inode *inode, ...@@ -1308,12 +1376,18 @@ static int __block_write_full_page(struct inode *inode,
*/ */
do { do {
if (block > last_block) { if (block > last_block) {
clear_buffer_dirty(bh); /*
if (buffer_mapped(bh)) * mapped buffers outside i_size will occur, because
buffer_error(); * this page can be outside i_size when there is a
* truncate in progress.
*
* if (buffer_mapped(bh))
* buffer_error();
*/
/* /*
* The buffer was zeroed by block_write_full_page() * The buffer was zeroed by block_write_full_page()
*/ */
clear_buffer_dirty(bh);
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
} else if (!buffer_mapped(bh) && buffer_dirty(bh)) { } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
if (buffer_new(bh)) if (buffer_new(bh))
...@@ -1338,7 +1412,7 @@ static int __block_write_full_page(struct inode *inode, ...@@ -1338,7 +1412,7 @@ static int __block_write_full_page(struct inode *inode,
if (test_clear_buffer_dirty(bh)) { if (test_clear_buffer_dirty(bh)) {
if (!buffer_uptodate(bh)) if (!buffer_uptodate(bh))
buffer_error(); buffer_error();
set_buffer_async_io(bh); mark_buffer_async_write(bh);
} else { } else {
unlock_buffer(bh); unlock_buffer(bh);
} }
...@@ -1356,7 +1430,7 @@ static int __block_write_full_page(struct inode *inode, ...@@ -1356,7 +1430,7 @@ static int __block_write_full_page(struct inode *inode,
*/ */
do { do {
struct buffer_head *next = bh->b_this_page; struct buffer_head *next = bh->b_this_page;
if (buffer_async(bh)) { if (buffer_async_write(bh)) {
submit_bh(WRITE, bh); submit_bh(WRITE, bh);
nr_underway++; nr_underway++;
} }
...@@ -1398,7 +1472,7 @@ static int __block_write_full_page(struct inode *inode, ...@@ -1398,7 +1472,7 @@ static int __block_write_full_page(struct inode *inode,
do { do {
if (buffer_mapped(bh)) { if (buffer_mapped(bh)) {
lock_buffer(bh); lock_buffer(bh);
set_buffer_async_io(bh); mark_buffer_async_write(bh);
} else { } else {
/* /*
* The buffer may have been set dirty during * The buffer may have been set dirty during
...@@ -1410,7 +1484,7 @@ static int __block_write_full_page(struct inode *inode, ...@@ -1410,7 +1484,7 @@ static int __block_write_full_page(struct inode *inode,
} while (bh != head); } while (bh != head);
do { do {
struct buffer_head *next = bh->b_this_page; struct buffer_head *next = bh->b_this_page;
if (buffer_mapped(bh)) { if (buffer_async_write(bh)) {
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
clear_buffer_dirty(bh); clear_buffer_dirty(bh);
submit_bh(WRITE, bh); submit_bh(WRITE, bh);
...@@ -1631,13 +1705,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block) ...@@ -1631,13 +1705,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
/* Stage two: lock the buffers */ /* Stage two: lock the buffers */
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
struct buffer_head * bh = arr[i]; bh = arr[i];
lock_buffer(bh); lock_buffer(bh);
if (buffer_uptodate(bh)) mark_buffer_async_read(bh);
buffer_error();
if (buffer_dirty(bh))
buffer_error();
set_buffer_async_io(bh);
} }
/* /*
...@@ -1646,9 +1716,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block) ...@@ -1646,9 +1716,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
* the underlying blockdev brought it uptodate (the sct fix). * the underlying blockdev brought it uptodate (the sct fix).
*/ */
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
struct buffer_head * bh = arr[i]; bh = arr[i];
if (buffer_uptodate(bh)) if (buffer_uptodate(bh))
end_buffer_io_async(bh, 1); end_buffer_async_read(bh, 1);
else else
submit_bh(READ, bh); submit_bh(READ, bh);
} }
...@@ -2074,9 +2144,15 @@ int brw_page(int rw, struct page *page, ...@@ -2074,9 +2144,15 @@ int brw_page(int rw, struct page *page,
bh->b_blocknr = *(b++); bh->b_blocknr = *(b++);
bh->b_bdev = bdev; bh->b_bdev = bdev;
set_buffer_mapped(bh); set_buffer_mapped(bh);
if (rw == WRITE) if (rw == WRITE) {
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
set_buffer_async_io(bh); clear_buffer_dirty(bh);
}
/*
* Swap pages are locked during writeout, so use
* buffer_async_read in strange ways.
*/
mark_buffer_async_read(bh);
bh = bh->b_this_page; bh = bh->b_this_page;
} while (bh != head); } while (bh != head);
......
...@@ -267,7 +267,7 @@ void journal_commit_transaction(journal_t *journal) ...@@ -267,7 +267,7 @@ void journal_commit_transaction(journal_t *journal)
sync_datalist_empty: sync_datalist_empty:
/* /*
* Wait for all the async writepage data. As they become unlocked * Wait for all the async writepage data. As they become unlocked
* in end_buffer_io_async(), the only place where they can be * in end_buffer_async_write(), the only place where they can be
* reaped is in try_to_free_buffers(), and we're locked against * reaped is in try_to_free_buffers(), and we're locked against
* that. * that.
*/ */
......
...@@ -905,8 +905,8 @@ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) ...@@ -905,8 +905,8 @@ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
* The buffer is placed on the transaction's data list and is marked as * The buffer is placed on the transaction's data list and is marked as
* belonging to the transaction. * belonging to the transaction.
* *
* If `async' is set then the writebask will be initiated by the caller * If `async' is set then the writebabk will be initiated by the caller
* using submit_bh -> end_buffer_io_async. We put the buffer onto * using submit_bh -> end_buffer_async_write. We put the buffer onto
* t_async_datalist. * t_async_datalist.
* *
* Returns error number or 0 on success. * Returns error number or 0 on success.
...@@ -1851,7 +1851,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) ...@@ -1851,7 +1851,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
} }
zap_buffer: zap_buffer:
if (buffer_dirty(bh))
clear_buffer_dirty(bh); clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jdirty(bh)); J_ASSERT_BH(bh, !buffer_jdirty(bh));
clear_buffer_mapped(bh); clear_buffer_mapped(bh);
......
...@@ -1916,7 +1916,7 @@ static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) { ...@@ -1916,7 +1916,7 @@ static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) {
for(i = 0 ; i < nr ; i++) { for(i = 0 ; i < nr ; i++) {
bh = bhp[i] ; bh = bhp[i] ;
lock_buffer(bh) ; lock_buffer(bh) ;
set_buffer_async_io(bh) ; mark_buffer_async_write(bh) ;
/* submit_bh doesn't care if the buffer is dirty, but nobody /* submit_bh doesn't care if the buffer is dirty, but nobody
** later on in the call chain will be cleaning it. So, we ** later on in the call chain will be cleaning it. So, we
** clean the buffer here, it still gets written either way. ** clean the buffer here, it still gets written either way.
......
/* /*
* include/linux/buffer_head.h * include/linux/buffer_head.h
* *
* Everything to do with buffer_head.b_state. * Everything to do with buffer_heads.
*/ */
#ifndef BUFFER_FLAGS_H #ifndef BUFFER_FLAGS_H
#define BUFFER_FLAGS_H #define BUFFER_FLAGS_H
/* bh state bits */
enum bh_state_bits { enum bh_state_bits {
BH_Uptodate, /* 1 if the buffer contains valid data */ BH_Uptodate, /* Contains valid data */
BH_Dirty, /* 1 if the buffer is dirty */ BH_Dirty, /* Is dirty */
BH_Lock, /* 1 if the buffer is locked */ BH_Lock, /* Is locked */
BH_Req, /* 0 if the buffer has been invalidated */ BH_Req, /* Has been submitted for I/O */
BH_Mapped, /* 1 if the buffer has a disk mapping */ BH_Mapped, /* Has a disk mapping */
BH_New, /* 1 if the buffer is new and not yet written out */ BH_New, /* Disk mapping was newly created by get_block */
BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */ BH_Async_Read, /* Is under end_buffer_async_read I/O */
BH_JBD, /* 1 if it has an attached journal_head */ BH_Async_Write, /* Is under end_buffer_async_write I/O */
BH_JBD, /* Has an attached ext3 journal_head */
BH_PrivateStart,/* not a state bit, but the first bit available BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities * for private allocation by other entities
*/ */
...@@ -32,28 +32,22 @@ struct buffer_head; ...@@ -32,28 +32,22 @@ struct buffer_head;
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
/* /*
* Try to keep the most commonly used fields in single cache lines (16 * Keep related fields in common cachelines. The most commonly accessed
* bytes) to improve performance. This ordering should be * field (b_state) goes at the start so the compiler does not generate
* particularly beneficial on 32-bit processors. * indexed addressing for it.
*
* We use the first 16 bytes for the data which is used in searches
* over the block hash lists (ie. getblk() and friends).
*
* The second 16 bytes we use for lru buffer scans, as used by
* sync_buffers() and refill_freelist(). -- sct
*/ */
struct buffer_head { struct buffer_head {
/* First cache line: */ /* First cache line: */
sector_t b_blocknr; /* block number */
unsigned short b_size; /* block size */
struct block_device *b_bdev;
atomic_t b_count; /* users using this block */
unsigned long b_state; /* buffer state bitmap (see above) */ unsigned long b_state; /* buffer state bitmap (see above) */
atomic_t b_count; /* users using this block */
struct buffer_head *b_this_page;/* circular list of page's buffers */ struct buffer_head *b_this_page;/* circular list of page's buffers */
struct page *b_page; /* the page this bh is mapped to */ struct page *b_page; /* the page this bh is mapped to */
char * b_data; /* pointer to data block */ sector_t b_blocknr; /* block number */
unsigned short b_size; /* block size */
char *b_data; /* pointer to data block */
struct block_device *b_bdev;
bh_end_io_t *b_end_io; /* I/O completion */ bh_end_io_t *b_end_io; /* I/O completion */
void *b_private; /* reserved for b_end_io */ void *b_private; /* reserved for b_end_io */
struct list_head b_inode_buffers; /* list of inode dirty buffers */ struct list_head b_inode_buffers; /* list of inode dirty buffers */
...@@ -91,6 +85,11 @@ static inline int test_clear_buffer_##name(struct buffer_head *bh) \ ...@@ -91,6 +85,11 @@ static inline int test_clear_buffer_##name(struct buffer_head *bh) \
return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ return test_and_clear_bit(BH_##bit, &(bh)->b_state); \
} \ } \
/*
* Emit the buffer bitops functions. Note that there are also functions
* of the form "mark_buffer_foo()". These are higher-level functions which
* do something in addition to setting a b_state bit.
*/
BUFFER_FNS(Uptodate, uptodate) BUFFER_FNS(Uptodate, uptodate)
BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty)
...@@ -99,15 +98,12 @@ TAS_BUFFER_FNS(Lock, locked) ...@@ -99,15 +98,12 @@ TAS_BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req) BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped) BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new) BUFFER_FNS(New, new)
BUFFER_FNS(Async, async) BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
/*
* Utility macros
*/
/* /*
* FIXME: this is used only by bh_kmap, which is used only by RAID5. * FIXME: this is used only by bh_kmap, which is used only by RAID5.
* Clean this up with blockdev-in-highmem infrastructure. * Move all that stuff into raid5.c
*/ */
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
...@@ -152,8 +148,8 @@ void end_buffer_io_sync(struct buffer_head *bh, int uptodate); ...@@ -152,8 +148,8 @@ void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
void buffer_insert_list(spinlock_t *lock, void buffer_insert_list(spinlock_t *lock,
struct buffer_head *, struct list_head *); struct buffer_head *, struct list_head *);
/* reiserfs_writepage needs this */ void mark_buffer_async_read(struct buffer_head *bh);
void set_buffer_async_io(struct buffer_head *bh); void mark_buffer_async_write(struct buffer_head *bh);
void invalidate_inode_buffers(struct inode *); void invalidate_inode_buffers(struct inode *);
void invalidate_bdev(struct block_device *, int); void invalidate_bdev(struct block_device *, int);
void __invalidate_buffers(kdev_t dev, int); void __invalidate_buffers(kdev_t dev, int);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment