Commit 47bb09d8 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] JBD: fix race over access to b_committed_data

From: Alex Tomas <bzzz@tmi.comex.ru>

We have a race wherein the block allocator can decide that
journal_head.b_committed_data is present and then use it, while kjournald
concurrently frees it and sets the pointer to NULL.  It goes oops.

We introduce per-buffer_head "spinlocking" based on a bit in b_state.  To do
this we abstract out pte_chain_lock() and reuse the implementation.

The bit-based spinlocking is pretty inefficient CPU-wise (hence the warning
in there) and we may move this to a hashed spinlock later.
parent 17aff938
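
The sketch below condenses the pattern the patch introduces, for readers skimming the diff: the allocator only dereferences b_committed_data while holding the new BH_State bit lock, and kjournald frees the snapshot under the same lock. The helper names (jbd_lock_bh_state() and friends) come straight from the diff; the two wrapper functions are illustrative only and are not part of the patch.

/*
 * Illustrative wrappers, not part of the patch: they condense how the
 * allocator and commit paths now serialise on jh->b_committed_data.
 */

/* Allocator side: test the committed-data bitmap only under the lock. */
static int block_busy_in_committed_data(struct buffer_head *bitmap_bh, int nr)
{
	int busy = 0;

	jbd_lock_bh_state(bitmap_bh);
	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
		busy = ext3_test_bit(nr, bh2jh(bitmap_bh)->b_committed_data);
	jbd_unlock_bh_state(bitmap_bh);
	return busy;
}

/* Commit side (kjournald): free the snapshot under the same lock, so a
 * racing allocator can never see a pointer that is about to be freed. */
static void drop_committed_data(struct journal_head *jh)
{
	jbd_lock_bh_state(jh2bh(jh));
	if (jh->b_committed_data) {
		kfree(jh->b_committed_data);
		jh->b_committed_data = NULL;
	}
	jbd_unlock_bh_state(jh2bh(jh));
}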
@@ -185,11 +185,14 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
 	if (err)
 		goto error_return;
+	jbd_lock_bh_state(bitmap_bh);
 	for (i = 0; i < count; i++) {
 		/*
 		 * An HJ special.  This is expensive...
 		 */
 #ifdef CONFIG_JBD_DEBUG
+		jbd_unlock_bh_state(bitmap_bh);
 		{
 			struct buffer_head *debug_bh;
 			debug_bh = sb_find_get_block(sb, block + i);
@@ -202,6 +205,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
 				__brelse(debug_bh);
 			}
 		}
+		jbd_lock_bh_state(bitmap_bh);
 #endif
 		/* @@@ This prevents newly-allocated data from being
 		 * freed and then reallocated within the same
@@ -243,6 +247,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
 			dquot_freed_blocks++;
 		}
 	}
+	jbd_unlock_bh_state(bitmap_bh);
 	spin_lock(sb_bgl_lock(sbi, block_group));
 	gdp->bg_free_blocks_count =
@@ -289,11 +294,12 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static inline int ext3_test_allocatable(int nr, struct buffer_head *bh,
+					int have_access)
 {
 	if (ext3_test_bit(nr, bh->b_data))
 		return 0;
-	if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+	if (!have_access || !buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
 		return 1;
 	return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
 }
@@ -305,8 +311,8 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int find_next_usable_block(int start,
-			struct buffer_head *bh, int maxblocks)
+static int find_next_usable_block(int start, struct buffer_head *bh,
+			int maxblocks, int have_access)
 {
 	int here, next;
 	char *p, *r;
@@ -322,7 +328,8 @@ static int find_next_usable_block(int start,
 		 */
 		int end_goal = (start + 63) & ~63;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext3_test_allocatable(here, bh))
+		if (here < end_goal &&
+		    ext3_test_allocatable(here, bh, have_access))
 			return here;
 		ext3_debug ("Bit not found near goal\n");
@@ -345,7 +352,7 @@ static int find_next_usable_block(int start,
 	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
 	next = (r - ((char *) bh->b_data)) << 3;
-	if (next < maxblocks && ext3_test_allocatable(next, bh))
+	if (next < maxblocks && ext3_test_allocatable(next, bh, have_access))
 		return next;
 	/* The bitmap search --- search forward alternately
@@ -357,10 +364,10 @@ static int find_next_usable_block(int start,
 						maxblocks, here);
 		if (next >= maxblocks)
 			return -1;
-		if (ext3_test_allocatable(next, bh))
+		if (ext3_test_allocatable(next, bh, have_access))
 			return next;
-		J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
+		if (have_access)
 			here = ext3_find_next_zero_bit
 				((unsigned long *) bh2jh(bh)->b_committed_data,
 				maxblocks, next);
@@ -402,17 +409,18 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	*errp = 0;
-	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh, 0))
 		goto got;
 repeat:
 	goal = find_next_usable_block(goal, bitmap_bh,
-				EXT3_BLOCKS_PER_GROUP(sb));
+				EXT3_BLOCKS_PER_GROUP(sb), have_access);
 	if (goal < 0)
 		goto fail;
 	for (i = 0;
-	     i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+	     i < 7 && goal > 0 &&
+	     ext3_test_allocatable(goal - 1, bitmap_bh, have_access);
 	     i++, goal--);
 got:
@@ -429,6 +437,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 			*errp = fatal;
 			goto fail;
 		}
+		jbd_lock_bh_state(bitmap_bh);
 		have_access = 1;
 	}
@@ -444,6 +453,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	}
 	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+	jbd_unlock_bh_state(bitmap_bh);
 	fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (fatal) {
 		*errp = fatal;
@@ -454,6 +464,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 fail:
 	if (have_access) {
 		BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+		jbd_unlock_bh_state(bitmap_bh);
 		ext3_journal_release_buffer(handle, bitmap_bh);
 	}
 	return -1;
@@ -611,14 +622,19 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
 			brelse(debug_bh);
 		}
 	}
-#endif
+	jbd_lock_bh_state(bitmap_bh);
 	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
-		J_ASSERT_BH(bitmap_bh,
-			!ext3_test_bit(ret_block,
-					bh2jh(bitmap_bh)->b_committed_data));
+	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+		if (ext3_test_bit(ret_block,
+				bh2jh(bitmap_bh)->b_committed_data)) {
+			printk("%s: block was unexpectedly set in "
+				"b_committed_data\n", __FUNCTION__);
+		}
+	}
 	ext3_debug("found bit %d\n", ret_block);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
+	jbd_unlock_bh_state(bitmap_bh);
+#endif
 	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
 	ret_block = target_block;
...
@@ -640,6 +640,7 @@ void journal_commit_transaction(journal_t *journal)
 			 *
 			 * Otherwise, we can just throw away the frozen data now.
 			 */
+			jbd_lock_bh_state(jh2bh(jh));
 			if (jh->b_committed_data) {
 				kfree(jh->b_committed_data);
 				jh->b_committed_data = NULL;
@@ -651,6 +652,7 @@ void journal_commit_transaction(journal_t *journal)
 				kfree(jh->b_frozen_data);
 				jh->b_frozen_data = NULL;
 			}
+			jbd_unlock_bh_state(jh2bh(jh));
 			spin_lock(&journal_datalist_lock);
 			cp_transaction = jh->b_cp_transaction;
...
@@ -292,6 +292,7 @@ enum jbd_state_bits {
 	BH_Revoked,		/* Has been revoked from the log */
 	BH_RevokeValid,		/* Revoked flag is valid */
 	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
 };

 BUFFER_FNS(JBD, jbd)
@@ -309,6 +310,21 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
 	return bh->b_private;
 }

+static inline void jbd_lock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_lock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_trylock_bh_state(struct buffer_head *bh)
+{
+	return bit_spin_trylock(BH_State, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_unlock(BH_State, &bh->b_state);
+}
+
 #define HAVE_JOURNAL_CALLBACK_STATUS
 /**
  * struct journal_callback - Base structure for callback information.
...
@@ -10,32 +10,8 @@
 struct pte_chain;
 extern kmem_cache_t *pte_chain_cache;

-static inline void pte_chain_lock(struct page *page)
-{
-	/*
-	 * Assuming the lock is uncontended, this never enters
-	 * the body of the outer loop.  If it is contended, then
-	 * within the inner loop a non-atomic test is used to
-	 * busywait with less bus contention for a good time to
-	 * attempt to acquire the lock bit.
-	 */
-	preempt_disable();
-#ifdef CONFIG_SMP
-	while (test_and_set_bit(PG_chainlock, &page->flags)) {
-		while (test_bit(PG_chainlock, &page->flags))
-			cpu_relax();
-	}
-#endif
-}
-
-static inline void pte_chain_unlock(struct page *page)
-{
-#ifdef CONFIG_SMP
-	smp_mb__before_clear_bit();
-	clear_bit(PG_chainlock, &page->flags);
-#endif
-	preempt_enable();
-}
+#define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, &page->flags)

 struct pte_chain *pte_chain_alloc(int gfp_flags);
 void __pte_chain_free(struct pte_chain *pte_chain);
...
@@ -395,4 +395,74 @@ do { \
 extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
 #endif

+/*
+ * bit-based spin_lock()
+ *
+ * Don't use this unless you really need to: spin_lock() and spin_unlock()
+ * are significantly faster.
+ */
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop.  If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+	preempt_disable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	while (test_and_set_bit(bitnum, addr)) {
+		while (test_bit(bitnum, addr))
+			cpu_relax();
+	}
+#endif
+}
+
+/*
+ * Return true if it was acquired
+ */
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	int ret;
+
+	preempt_disable();
+	ret = !test_and_set_bit(bitnum, addr);
+	if (!ret)
+		preempt_enable();
+	return ret;
+#else
+	preempt_disable();
+	return 1;
+#endif
+}
+
+/*
+ * bit-based spin_unlock()
+ */
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	BUG_ON(!test_bit(bitnum, addr));
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	preempt_enable();
+}
+
+/*
+ * Return true if the lock is held.
+ */
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	return test_bit(bitnum, addr);
+#elif defined CONFIG_PREEMPT
+	return preempt_count();
+#else
+	return 1;
+#endif
+}
+
 #endif /* __LINUX_SPINLOCK_H */