Commit cd9ab8c2, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] ext3: fix scheduling storm and lockups

There have been sporadic sightings of ext3 causing little blips of 100,000
context switches per second when under load.

At the start of do_get_write_access() we have this logic:

	repeat:
		lock_buffer(jh->bh);
		...
		unlock_buffer(jh->bh);
		...
		if (jh->b_jlist == BJ_Shadow) {
			sleep_on_buffer(jh->bh);
			goto repeat;
		}

The problem is that the unlock_buffer() will wake up anyone who is sleeping
in the sleep_on_buffer().

So if task A is asleep in sleep_on_buffer() and task B now runs
do_get_write_access(), task B will wake task A by accident.  Task B will then
sleep on the buffer and task A will loop, will run unlock_buffer() and then
wake task B.

This state will continue until I/O completes against the buffer and kjournald
changes jh->b_jlist.

Unless task A and task B happen to both have realtime scheduling policy - if
they do then kjournald will never run.  The state is never cleared and your
box locks up.


The fix is to not do the `goto repeat;' until the buffer has been taken off
the shadow list.  So we don't go and wake up the other waiter(s) until they
can actually proceed to use the buffer.

The patch removes the exported sleep_on_buffer() function and simply exports
an existing function which provides access to a buffer_head's waitqueue
pointer.  Which is a better interface anyway, because it permits the use of
wait_event().

This bug was introduced into 2.4.20-pre5 and was faithfully ported
up.
parent 2ef0192c
...@@ -81,21 +81,11 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) ...@@ -81,21 +81,11 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
* Return the address of the waitqueue_head to be used for this * Return the address of the waitqueue_head to be used for this
* buffer_head * buffer_head
*/ */
static wait_queue_head_t *bh_waitq_head(struct buffer_head *bh) wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
{ {
return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh; return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
} }
EXPORT_SYMBOL(bh_waitq_head);
/*
* Wait on a buffer until someone does a wakeup on it. Needs
* lots of external locking. ext3 uses this. Fix it.
*/
void sleep_on_buffer(struct buffer_head *bh)
{
wait_queue_head_t *wq = bh_waitq_head(bh);
sleep_on(wq);
}
EXPORT_SYMBOL(sleep_on_buffer);
void wake_up_buffer(struct buffer_head *bh) void wake_up_buffer(struct buffer_head *bh)
{ {
......
...@@ -689,11 +689,14 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) ...@@ -689,11 +689,14 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
* disk then we cannot do copy-out here. */ * disk then we cannot do copy-out here. */
if (jh->b_jlist == BJ_Shadow) { if (jh->b_jlist == BJ_Shadow) {
wait_queue_head_t *wqh;
JBUFFER_TRACE(jh, "on shadow: sleep"); JBUFFER_TRACE(jh, "on shadow: sleep");
spin_unlock(&journal_datalist_lock); spin_unlock(&journal_datalist_lock);
unlock_journal(journal); unlock_journal(journal);
/* commit wakes up all shadow buffers after IO */ /* commit wakes up all shadow buffers after IO */
sleep_on_buffer(jh2bh(jh)); wqh = bh_waitq_head(jh2bh(jh));
wait_event(*wqh, (jh->b_jlist != BJ_Shadow));
lock_journal(journal); lock_journal(journal);
goto repeat; goto repeat;
} }
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/wait.h>
#include <asm/atomic.h> #include <asm/atomic.h>
enum bh_state_bits { enum bh_state_bits {
...@@ -154,7 +155,7 @@ void invalidate_bdev(struct block_device *, int); ...@@ -154,7 +155,7 @@ void invalidate_bdev(struct block_device *, int);
void __invalidate_buffers(kdev_t dev, int); void __invalidate_buffers(kdev_t dev, int);
int sync_blockdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev);
void __wait_on_buffer(struct buffer_head *); void __wait_on_buffer(struct buffer_head *);
void sleep_on_buffer(struct buffer_head *bh); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
void wake_up_buffer(struct buffer_head *bh); void wake_up_buffer(struct buffer_head *bh);
int fsync_bdev(struct block_device *); int fsync_bdev(struct block_device *);
int fsync_super(struct super_block *); int fsync_super(struct super_block *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment