Commit 47bb09d8 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] JBD: fix race over access to b_committed_data

From: Alex Tomas <bzzz@tmi.comex.ru>

We have a race wherein the block allocator can decide that
journal_head.b_committed_data is present and then go on to use it, while
kjournald concurrently frees it and sets the pointer to NULL.  The result is
an oops.

We introduce per-buffer_head "spinlocking" based on a bit in b_state.  To do
this we abstract the pte_chain_lock() implementation out into a generic
bit_spin_lock() and reuse it for both.

The bit-based spinlocking is pretty inefficient CPU-wise (hence the warning
comment added to spinlock.h) and we may move this to a hashed spinlock later.
parent 17aff938
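
For context before the diff: the fix boils down to a bit-based spinlock.  The
allocator only dereferences b_committed_data while holding a lock bit in the
buffer_head's state word, and kjournald frees and clears the pointer under
that same bit, so the pointer can no longer vanish between the check and the
use.  Below is a minimal userspace sketch of that pattern, assuming C11
atomics in place of the kernel's test_and_set_bit()/clear_bit(); struct jhead,
STATE_BIT and the helper names are invented for illustration and are not part
of the patch.

/*
 * Illustrative userspace sketch only -- not the kernel code.  One bit of a
 * "state" word acts as a spinlock: the reader checks the pointer under that
 * lock, and the freeing side clears the pointer under the same lock.
 */
#include <stdatomic.h>
#include <stdlib.h>

#define STATE_BIT 0UL				/* made-up lock bit */

struct jhead {
	atomic_ulong state;			/* bit STATE_BIT is the lock */
	char *committed_data;
};

static void state_lock(struct jhead *jh)
{
	unsigned long mask = 1UL << STATE_BIT;

	/* Spin until we are the caller that flips the bit from 0 to 1. */
	while (atomic_fetch_or_explicit(&jh->state, mask,
					memory_order_acquire) & mask) {
		/* Busy-wait with plain loads while the bit stays set. */
		while (atomic_load_explicit(&jh->state,
					    memory_order_relaxed) & mask)
			;
	}
}

static void state_unlock(struct jhead *jh)
{
	atomic_fetch_and_explicit(&jh->state, ~(1UL << STATE_BIT),
				  memory_order_release);
}

/* Allocator side: only dereference committed_data under the lock. */
static int first_byte_or_zero(struct jhead *jh)
{
	int ret = 0;

	state_lock(jh);
	if (jh->committed_data)
		ret = jh->committed_data[0];
	state_unlock(jh);
	return ret;
}

/* Commit side: free and clear the pointer under the same lock. */
static void drop_committed_data(struct jhead *jh)
{
	state_lock(jh);
	free(jh->committed_data);
	jh->committed_data = NULL;
	state_unlock(jh);
}

In the patch itself the lock bit is BH_State in b_state and the wrappers are
jbd_lock_bh_state()/jbd_unlock_bh_state(), shown in the hunks below.
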
@@ -185,11 +185,14 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
if (err)
goto error_return;
jbd_lock_bh_state(bitmap_bh);
for (i = 0; i < count; i++) {
/*
* An HJ special. This is expensive...
*/
#ifdef CONFIG_JBD_DEBUG
jbd_unlock_bh_state(bitmap_bh);
{
struct buffer_head *debug_bh;
debug_bh = sb_find_get_block(sb, block + i);
@@ -202,6 +205,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
__brelse(debug_bh);
}
}
jbd_lock_bh_state(bitmap_bh);
#endif
/* @@@ This prevents newly-allocated data from being
* freed and then reallocated within the same
@@ -243,6 +247,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
dquot_freed_blocks++;
}
}
jbd_unlock_bh_state(bitmap_bh);
spin_lock(sb_bgl_lock(sbi, block_group));
gdp->bg_free_blocks_count =
@@ -289,11 +294,12 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
* data-writes at some point, and disable it for metadata allocations or
* sync-data inodes.
*/
static int ext3_test_allocatable(int nr, struct buffer_head *bh)
static inline int ext3_test_allocatable(int nr, struct buffer_head *bh,
int have_access)
{
if (ext3_test_bit(nr, bh->b_data))
return 0;
if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
if (!have_access || !buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
return 1;
return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
}
@@ -305,8 +311,8 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
* the initial goal; then for a free byte somewhere in the bitmap; then
* for any free bit in the bitmap.
*/
static int find_next_usable_block(int start,
struct buffer_head *bh, int maxblocks)
static int find_next_usable_block(int start, struct buffer_head *bh,
int maxblocks, int have_access)
{
int here, next;
char *p, *r;
@@ -322,7 +328,8 @@ static int find_next_usable_block(int start,
*/
int end_goal = (start + 63) & ~63;
here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
if (here < end_goal && ext3_test_allocatable(here, bh))
if (here < end_goal &&
ext3_test_allocatable(here, bh, have_access))
return here;
ext3_debug ("Bit not found near goal\n");
@@ -345,7 +352,7 @@ static int find_next_usable_block(int start,
r = memscan(p, 0, (maxblocks - here + 7) >> 3);
next = (r - ((char *) bh->b_data)) << 3;
if (next < maxblocks && ext3_test_allocatable(next, bh))
if (next < maxblocks && ext3_test_allocatable(next, bh, have_access))
return next;
/* The bitmap search --- search forward alternately
@@ -357,13 +364,13 @@ static int find_next_usable_block(int start,
maxblocks, here);
if (next >= maxblocks)
return -1;
if (ext3_test_allocatable(next, bh))
if (ext3_test_allocatable(next, bh, have_access))
return next;
J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
here = ext3_find_next_zero_bit
((unsigned long *) bh2jh(bh)->b_committed_data,
maxblocks, next);
if (have_access)
here = ext3_find_next_zero_bit
((unsigned long *) bh2jh(bh)->b_committed_data,
maxblocks, next);
}
return -1;
}
@@ -402,17 +409,18 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
*errp = 0;
if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh, 0))
goto got;
repeat:
goal = find_next_usable_block(goal, bitmap_bh,
EXT3_BLOCKS_PER_GROUP(sb));
EXT3_BLOCKS_PER_GROUP(sb), have_access);
if (goal < 0)
goto fail;
for (i = 0;
i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
i < 7 && goal > 0 &&
ext3_test_allocatable(goal - 1, bitmap_bh, have_access);
i++, goal--);
got:
@@ -429,6 +437,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
*errp = fatal;
goto fail;
}
jbd_lock_bh_state(bitmap_bh);
have_access = 1;
}
@@ -444,6 +453,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
}
BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
jbd_unlock_bh_state(bitmap_bh);
fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
if (fatal) {
*errp = fatal;
@@ -454,6 +464,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
fail:
if (have_access) {
BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
jbd_unlock_bh_state(bitmap_bh);
ext3_journal_release_buffer(handle, bitmap_bh);
}
return -1;
@@ -611,14 +622,19 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
brelse(debug_bh);
}
}
#endif
jbd_lock_bh_state(bitmap_bh);
spin_lock(sb_bgl_lock(sbi, group_no));
if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
J_ASSERT_BH(bitmap_bh,
!ext3_test_bit(ret_block,
bh2jh(bitmap_bh)->b_committed_data));
if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
if (ext3_test_bit(ret_block,
bh2jh(bitmap_bh)->b_committed_data)) {
printk("%s: block was unexpectedly set in "
"b_committed_data\n", __FUNCTION__);
}
}
ext3_debug("found bit %d\n", ret_block);
spin_unlock(sb_bgl_lock(sbi, group_no));
jbd_unlock_bh_state(bitmap_bh);
#endif
/* ret_block was blockgroup-relative. Now it becomes fs-relative */
ret_block = target_block;
@@ -640,6 +640,7 @@ void journal_commit_transaction(journal_t *journal)
*
* Otherwise, we can just throw away the frozen data now.
*/
jbd_lock_bh_state(jh2bh(jh));
if (jh->b_committed_data) {
kfree(jh->b_committed_data);
jh->b_committed_data = NULL;
@@ -651,6 +652,7 @@ void journal_commit_transaction(journal_t *journal)
kfree(jh->b_frozen_data);
jh->b_frozen_data = NULL;
}
jbd_unlock_bh_state(jh2bh(jh));
spin_lock(&journal_datalist_lock);
cp_transaction = jh->b_cp_transaction;
@@ -292,6 +292,7 @@ enum jbd_state_bits {
BH_Revoked, /* Has been revoked from the log */
BH_RevokeValid, /* Revoked flag is valid */
BH_JBDDirty, /* Is dirty but journaled */
BH_State, /* Pins most journal_head state */
};
BUFFER_FNS(JBD, jbd)
@@ -309,6 +310,21 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
return bh->b_private;
}
static inline void jbd_lock_bh_state(struct buffer_head *bh)
{
bit_spin_lock(BH_State, &bh->b_state);
}
static inline int jbd_trylock_bh_state(struct buffer_head *bh)
{
return bit_spin_trylock(BH_State, &bh->b_state);
}
static inline void jbd_unlock_bh_state(struct buffer_head *bh)
{
bit_spin_unlock(BH_State, &bh->b_state);
}
#define HAVE_JOURNAL_CALLBACK_STATUS
/**
* struct journal_callback - Base structure for callback information.
@@ -10,32 +10,8 @@
struct pte_chain;
extern kmem_cache_t *pte_chain_cache;
static inline void pte_chain_lock(struct page *page)
{
/*
* Assuming the lock is uncontended, this never enters
* the body of the outer loop. If it is contended, then
* within the inner loop a non-atomic test is used to
* busywait with less bus contention for a good time to
* attempt to acquire the lock bit.
*/
preempt_disable();
#ifdef CONFIG_SMP
while (test_and_set_bit(PG_chainlock, &page->flags)) {
while (test_bit(PG_chainlock, &page->flags))
cpu_relax();
}
#endif
}
static inline void pte_chain_unlock(struct page *page)
{
#ifdef CONFIG_SMP
smp_mb__before_clear_bit();
clear_bit(PG_chainlock, &page->flags);
#endif
preempt_enable();
}
#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags)
#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags)
struct pte_chain *pte_chain_alloc(int gfp_flags);
void __pte_chain_free(struct pte_chain *pte_chain);
@@ -395,4 +395,74 @@ do { \
extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
#endif
/*
* bit-based spin_lock()
*
* Don't use this unless you really need to: spin_lock() and spin_unlock()
* are significantly faster.
*/
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
/*
* Assuming the lock is uncontended, this never enters
* the body of the outer loop. If it is contended, then
* within the inner loop a non-atomic test is used to
* busywait with less bus contention for a good time to
* attempt to acquire the lock bit.
*/
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
while (test_and_set_bit(bitnum, addr)) {
while (test_bit(bitnum, addr))
cpu_relax();
}
#endif
}
/*
* Return true if it was acquired
*/
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
int ret;
preempt_disable();
ret = !test_and_set_bit(bitnum, addr);
if (!ret)
preempt_enable();
return ret;
#else
preempt_disable();
return 1;
#endif
}
/*
* bit-based spin_unlock()
*/
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
BUG_ON(!test_bit(bitnum, addr));
smp_mb__before_clear_bit();
clear_bit(bitnum, addr);
#endif
preempt_enable();
}
/*
* Return true if the lock is held.
*/
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT
return preempt_count();
#else
return 1;
#endif
}
#endif /* __LINUX_SPINLOCK_H */
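
As a quick illustration of the trylock semantics added above ("Return true if
it was acquired"), here is a hedged userspace analogue built on C11 atomics;
bit_trylock() and bit_unlock() are invented names for this sketch, not the
kernel primitives, and the preemption handling of the real bit_spin_trylock()
is deliberately omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns true only if this call flipped the bit from 0 to 1. */
static bool bit_trylock(atomic_ulong *word, unsigned long bit)
{
	unsigned long mask = 1UL << bit;

	return !(atomic_fetch_or_explicit(word, mask,
					  memory_order_acquire) & mask);
}

static void bit_unlock(atomic_ulong *word, unsigned long bit)
{
	atomic_fetch_and_explicit(word, ~(1UL << bit),
				  memory_order_release);
}

int main(void)
{
	atomic_ulong state;

	atomic_init(&state, 0);
	printf("%d\n", bit_trylock(&state, 5));	/* 1: acquired */
	printf("%d\n", bit_trylock(&state, 5));	/* 0: already held */
	bit_unlock(&state, 5);
	printf("%d\n", bit_trylock(&state, 5));	/* 1: acquired again */
	return 0;
}
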