Commit 2a6cb303 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] fix ext3 memory leak

This is the leak which Con found.  Long story...

- If a dirty page is fed into ext3_writepage() during truncate,
  block_write_full_page() will return -EIO (it's outside i_size) and will
  leave the buffers dirty, in the expectation that discard_buffer() will
  clean them.

- ext3_writepage() then adds the still-dirty buffers to the journal's
  "async data list".  These are buffers which are known to have had IO
  started.  All we need to do is to wait on them in commit.

- meanwhile, truncate will chop the pages off the address_space.  But
  truncate cannot invalidate the buffers (in journal_unmap_buffer()) because
  the buffers are attached to the committing transaction.  (hm.  This
  behaviour in journal_unmap_buffer() is bogus.  We just never need to write
  these buffers.)

- ext3 commit will "wait on writeout" of these writepage buffers (even
  though it was never started) and will then release them from the
  journalling system.

So we end up with pages which are attached to no mapping, which are clean and
which have dirty buffers.  These are unreclaimable.


Aside:

  ext3-ordered has two buffer lists: the "sync data list" and the "async
  data list".

  The sync list consists of probably-dirty buffers which were dirtied in
  commit_write().  Transaction commit must write all these out and wait on
  them.

  The async list supposedly consists of clean buffers which were attached
  to the journal in ->writepage.  These have had IO started (by writepage) so
  commit merely needs to wait on them.

  This is all designed for the 2.4 VM really.  In 2.5, tons of writeback
  goes via writepage (instead of the buffer lru) and these buffers end up
  madly hopping between the async and sync lists.

  Plus it's arguably incorrect to just wait on the writes in commit - if
  the buffers were set dirty again (say, by zap_pte_range()) then perhaps we
  should write them again before committing.


So what the patch does is to remove the async list.  All ordered-data buffers
are now attached to the single "sync data list".  So when we come to commit,
those buffers which are dirty will have IO started and all buffers are waited
upon.

This means that the dirty buffers against a clean page which came about from
block_write_full_page()'s -EIO will be written to disk in commit - this
cleans them, and the page is now reclaimable.  No leak.

It seems bogus to write these buffers in commit, and indeed it is.  But ext3
will not allow those blocks to be reused until the commit has ended so there
is no corruption risk.  And the amount of data involved is low - it only
comes about as a race between truncate and writepage().
parent daebe5ee
......@@ -1098,20 +1098,14 @@ static int ext3_prepare_write(struct file *file, struct page *page,
return ret;
}
static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
{
return ext3_journal_dirty_data(handle, bh, 0);
}
/*
* For ext3_writepage(). We also brelse() the buffer to account for
* the bget() which ext3_writepage() performs.
*/
static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
static int
ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
int ret = ext3_journal_dirty_data(handle, bh, 1);
__brelse(bh);
return ret;
int err = journal_dirty_data(handle, bh);
if (err)
ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
bh, handle,err);
return err;
}
/* For commit_write() in data=journal mode */
......@@ -1154,7 +1148,7 @@ static int ext3_commit_write(struct file *file, struct page *page,
} else {
if (ext3_should_order_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, journal_dirty_sync_data);
from, to, NULL, ext3_journal_dirty_data);
}
/* Be careful here if generic_commit_write becomes a
* required invocation after block_prepare_write. */
......@@ -1228,7 +1222,13 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
atomic_inc(&bh->b_count);
get_bh(bh);
return 0;
}
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
put_bh(bh);
return 0;
}
......@@ -1348,7 +1348,9 @@ static int ext3_writepage(struct page *page, struct writeback_control *wbc)
/* And attach them to the current transaction */
if (order_data) {
err = walk_page_buffers(handle, page_bufs,
0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
0, PAGE_CACHE_SIZE, NULL, ext3_journal_dirty_data);
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bput_one);
if (!ret)
ret = err;
}
......@@ -1587,7 +1589,7 @@ static int ext3_block_truncate_page(handle_t *handle,
err = ext3_journal_dirty_metadata(handle, bh);
} else {
if (ext3_should_order_data(inode))
err = ext3_journal_dirty_data(handle, bh, 0);
err = ext3_journal_dirty_data(handle, bh);
mark_buffer_dirty(bh);
}
......
......@@ -585,7 +585,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
J_ASSERT (transaction->t_ilist == NULL);
J_ASSERT (transaction->t_buffers == NULL);
J_ASSERT (transaction->t_sync_datalist == NULL);
J_ASSERT (transaction->t_async_datalist == NULL);
J_ASSERT (transaction->t_forget == NULL);
J_ASSERT (transaction->t_iobuf_list == NULL);
J_ASSERT (transaction->t_shadow_list == NULL);
......
......@@ -264,37 +264,6 @@ void journal_commit_transaction(journal_t *journal)
goto write_out_data_locked;
sync_datalist_empty:
/*
* Wait for all the async writepage data. As they become unlocked
* in end_buffer_async_write(), the only place where they can be
* reaped is in try_to_free_buffers(), and we're locked against
* that.
*/
while ((jh = commit_transaction->t_async_datalist)) {
struct buffer_head *bh = jh2bh(jh);
if (buffer_locked(bh)) {
spin_unlock(&journal_datalist_lock);
unlock_journal(journal);
wait_on_buffer(bh);
lock_journal(journal);
spin_lock(&journal_datalist_lock);
continue; /* List may have changed */
}
if (jh->b_next_transaction) {
/*
* For writepage() buffers in journalled data mode: a
* later transaction may want the buffer for "metadata"
*/
__journal_refile_buffer(jh);
} else {
BUFFER_TRACE(bh, "finished async writeout: unfile");
__journal_unfile_buffer(jh);
jh->b_transaction = NULL;
__journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "finished async writeout: refile");
__brelse(bh);
}
}
spin_unlock(&journal_datalist_lock);
/*
......@@ -304,7 +273,6 @@ void journal_commit_transaction(journal_t *journal)
* clean by now, so check that it is in fact empty.
*/
J_ASSERT (commit_transaction->t_sync_datalist == NULL);
J_ASSERT (commit_transaction->t_async_datalist == NULL);
jbd_debug (3, "JBD: commit phase 3\n");
......@@ -629,7 +597,6 @@ void journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_sync_datalist == NULL);
J_ASSERT(commit_transaction->t_async_datalist == NULL);
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
......
......@@ -578,9 +578,6 @@ static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
* part of the transaction, that is).
*
* Returns an error code or 0 on success.
*
* In full data journalling mode the buffer may be of type BJ_AsyncData,
* because we're write()ing a buffer which is also part of a shared mapping.
*/
static int
......@@ -949,26 +946,16 @@ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
* The buffer is placed on the transaction's data list and is marked as
* belonging to the transaction.
*
* If `async' is set then the writeback will be initiated by the caller
* using submit_bh -> end_buffer_async_write. We put the buffer onto
* t_async_datalist.
*
* Returns error number or 0 on success.
*
* journal_dirty_data() can be called via page_launder->ext3_writepage
* by kswapd. So it cannot block. Happily, there's nothing here
* which needs lock_journal if `async' is set.
*
* When the buffer is on the current transaction we freely move it
* between BJ_AsyncData and BJ_SyncData according to who tried to
* change its state last.
*/
int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
int journal_dirty_data (handle_t *handle, struct buffer_head *bh)
{
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
struct journal_head *jh;
if (is_handle_aborted(handle))
......@@ -1046,8 +1033,7 @@ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
* the write() data.
*/
if (jh->b_jlist != BJ_None &&
jh->b_jlist != BJ_SyncData &&
jh->b_jlist != BJ_AsyncData) {
jh->b_jlist != BJ_SyncData) {
JBUFFER_TRACE(jh, "Not stealing");
goto no_journal;
}
......@@ -1058,7 +1044,7 @@ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
* again because that can cause the write-out loop in
* commit to never terminate.
*/
if (!async && buffer_dirty(bh)) {
if (buffer_dirty(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal_datalist_lock);
need_brelse = 1;
......@@ -1084,18 +1070,18 @@ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
* committing transaction, so might still be left on that
* transaction's metadata lists.
*/
if (jh->b_jlist != wanted_jlist) {
if (jh->b_jlist != BJ_SyncData) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
__journal_unfile_buffer(jh);
jh->b_transaction = NULL;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
wanted_jlist);
BJ_SyncData);
}
} else {
JBUFFER_TRACE(jh, "not on a transaction");
__journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
}
no_journal:
spin_unlock(&journal_datalist_lock);
......@@ -1559,12 +1545,12 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
* bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
* bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
* t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
* is holding onto a copy of one of these pointers, it could go bad.
* Generally the caller needs to re-read the pointer from the transaction_t.
*
* If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
* If bh->b_jlist is BJ_SyncData then we may have been called
* via journal_try_to_free_buffer() or journal_clean_data_list(). In that
* case, journal_datalist_lock will be held, and the journal may not be locked.
*/
......@@ -1590,9 +1576,6 @@ void __journal_unfile_buffer(struct journal_head *jh)
case BJ_SyncData:
list = &transaction->t_sync_datalist;
break;
case BJ_AsyncData:
list = &transaction->t_async_datalist;
break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
......@@ -1658,7 +1641,7 @@ static inline int __journal_try_to_free_buffer(struct buffer_head *bh)
goto out;
if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
if (jh->b_jlist == BJ_SyncData) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
__journal_unfile_buffer(jh);
......@@ -1994,9 +1977,6 @@ void __journal_file_buffer(struct journal_head *jh,
case BJ_SyncData:
list = &transaction->t_sync_datalist;
break;
case BJ_AsyncData:
list = &transaction->t_async_datalist;
break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
......
......@@ -132,16 +132,6 @@ __ext3_journal_get_write_access(const char *where,
return err;
}
static inline int
__ext3_journal_dirty_data(const char *where,
handle_t *handle, struct buffer_head *bh, int async)
{
int err = journal_dirty_data(handle, bh, async);
if (err)
ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
static inline void
ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
{
......@@ -183,8 +173,6 @@ __ext3_journal_dirty_metadata(const char *where,
__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
#define ext3_journal_get_write_access(handle, bh) \
__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
#define ext3_journal_dirty_data(handle, bh, async) \
__ext3_journal_dirty_data(__FUNCTION__, (handle), (bh), (async))
#define ext3_journal_revoke(handle, blocknr, bh) \
__ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
#define ext3_journal_get_create_access(handle, bh) \
......
......@@ -360,13 +360,6 @@ struct transaction_s
*/
struct journal_head * t_sync_datalist;
/*
* Doubly-linked circular list of all writepage data buffers
* still to be written before this transaction can be committed.
* Protected by journal_datalist_lock.
*/
struct journal_head * t_async_datalist;
/* Doubly-linked circular list of all forget buffers (superseded
buffers which we can un-checkpoint once this transaction
commits) */
......@@ -654,8 +647,7 @@ extern int journal_extend (handle_t *, int nblocks);
extern int journal_get_write_access (handle_t *, struct buffer_head *);
extern int journal_get_create_access (handle_t *, struct buffer_head *);
extern int journal_get_undo_access (handle_t *, struct buffer_head *);
extern int journal_dirty_data (handle_t *,
struct buffer_head *, int async);
extern int journal_dirty_data (handle_t *, struct buffer_head *);
extern int journal_dirty_metadata (handle_t *, struct buffer_head *);
extern void journal_release_buffer (handle_t *, struct buffer_head *);
extern void journal_forget (handle_t *, struct buffer_head *);
......@@ -806,14 +798,13 @@ extern int journal_blocks_per_page(struct inode *inode);
/* journaling buffer types */
#define BJ_None 0 /* Not journaled */
#define BJ_SyncData 1 /* Normal data: flush before commit */
#define BJ_AsyncData 2 /* writepage data: wait on it before commit */
#define BJ_Metadata 3 /* Normal journaled metadata */
#define BJ_Forget 4 /* Buffer superseded by this transaction */
#define BJ_IO 5 /* Buffer is for temporary IO use */
#define BJ_Shadow 6 /* Buffer contents being shadowed to the log */
#define BJ_LogCtl 7 /* Buffer contains log descriptors */
#define BJ_Reserved 8 /* Buffer is reserved for access by journal */
#define BJ_Types 9
#define BJ_Metadata 2 /* Normal journaled metadata */
#define BJ_Forget 3 /* Buffer superseded by this transaction */
#define BJ_IO 4 /* Buffer is for temporary IO use */
#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
#define BJ_LogCtl 6 /* Buffer contains log descriptors */
#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
#define BJ_Types 8
extern int jbd_blocks_per_page(struct inode *inode);
......@@ -860,8 +851,7 @@ static inline int buffer_jdirty(struct buffer_head *bh)
static inline int buffer_jbd_data(struct buffer_head *bh)
{
return SPLICE_LOCK(buffer_jbd(bh),
bh2jh(bh)->b_jlist == BJ_SyncData ||
bh2jh(bh)->b_jlist == BJ_AsyncData);
bh2jh(bh)->b_jlist == BJ_SyncData);
}
#ifdef CONFIG_SMP
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment