Commit a3b71057 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] O_DIRECT for ext3

O_DIRECT support for ext3.

It works OK in all journalling modes.

Updates to the file metadata and inode are journalled as usual.

If the system crashes during an appending O_DIRECT write then journal
recovery will truncate the written-to file back to the length which it
had on entry to that write.

If the system crashes during a file overwrite to existing blocks then
the file contents will be an unknown mixture of old and new.

If the system crashes during a file overwrite which instantiates new
blocks in the middle of the file then there is a possibility of
uninitialised disk blocks being present in the file post-recovery.
parent bebff73c
...@@ -734,9 +734,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, ...@@ -734,9 +734,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
* The BKL may not be held on entry here. Be sure to take it early. * The BKL may not be held on entry here. Be sure to take it early.
*/ */
static int ext3_get_block_handle(handle_t *handle, struct inode *inode, static int
sector_t iblock, ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create) struct buffer_head *bh_result, int create, int extend_disksize)
{ {
int err = -EIO; int err = -EIO;
int offsets[4]; int offsets[4];
...@@ -818,16 +818,17 @@ static int ext3_get_block_handle(handle_t *handle, struct inode *inode, ...@@ -818,16 +818,17 @@ static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
if (err) if (err)
goto cleanup; goto cleanup;
new_size = inode->i_size; if (extend_disksize) {
/* /*
* This is not racy against ext3_truncate's modification of i_disksize * This is not racy against ext3_truncate's modification of
* because VM/VFS ensures that the file cannot be extended while * i_disksize because VM/VFS ensures that the file cannot be
* truncate is in progress. It is racy between multiple parallel * extended while truncate is in progress. It is racy between
* instances of get_block, but we have the BKL. * multiple parallel instances of get_block, but we have BKL.
*/ */
if (new_size > ei->i_disksize) new_size = inode->i_size;
ei->i_disksize = new_size; if (new_size > ei->i_disksize)
ei->i_disksize = new_size;
}
set_buffer_new(bh_result); set_buffer_new(bh_result);
goto got_it; goto got_it;
...@@ -851,10 +852,43 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, ...@@ -851,10 +852,43 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
handle = ext3_journal_current_handle(); handle = ext3_journal_current_handle();
J_ASSERT(handle != 0); J_ASSERT(handle != 0);
} }
ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); ret = ext3_get_block_handle(handle, inode, iblock,
bh_result, create, 1);
return ret; return ret;
} }
#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create)
{
handle_t *handle = journal_current_handle();
int ret = 0;
lock_kernel();
if (handle && handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
/*
* Getting low on buffer credits...
*/
if (!ext3_journal_extend(handle, DIO_CREDITS)) {
/*
* Couldn't extend the transaction. Start a new one
*/
ret = ext3_journal_restart(handle, DIO_CREDITS);
}
}
if (ret == 0)
ret = ext3_get_block_handle(handle, inode, iblock,
bh_result, create, 0);
if (ret == 0)
bh_result->b_size = (1 << inode->i_blkbits);
unlock_kernel();
return ret;
}
/* /*
* `handle' can be NULL if create is zero * `handle' can be NULL if create is zero
*/ */
...@@ -869,7 +903,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, ...@@ -869,7 +903,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
dummy.b_state = 0; dummy.b_state = 0;
dummy.b_blocknr = -1000; dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history); buffer_trace_init(&dummy.b_history);
*errp = ext3_get_block_handle(handle, inode, block, &dummy, create); *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
if (!*errp && buffer_mapped(&dummy)) { if (!*errp && buffer_mapped(&dummy)) {
struct buffer_head *bh; struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr); bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
...@@ -1377,17 +1411,83 @@ static int ext3_releasepage(struct page *page, int wait) ...@@ -1377,17 +1411,83 @@ static int ext3_releasepage(struct page *page, int wait)
return journal_try_to_free_buffers(journal, page, wait); return journal_try_to_free_buffers(journal, page, wait);
} }
/*
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
*
* If the O_DIRECT write is intantiating holes inside i_size and the machine
* crashes then stale disk data _may_ be exposed inside the file.
*/
static int ext3_direct_IO(int rw, struct inode *inode, char *buf,
loff_t offset, size_t count)
{
struct ext3_inode_info *ei = EXT3_I(inode);
handle_t *handle = NULL;
int ret;
int orphan = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
lock_kernel();
handle = ext3_journal_start(inode, DIO_CREDITS);
unlock_kernel();
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
if (final_size > inode->i_size) {
lock_kernel();
ret = ext3_orphan_add(handle, inode);
unlock_kernel();
if (ret)
goto out_stop;
orphan = 1;
ei->i_disksize = inode->i_size;
}
}
ret = generic_direct_IO(rw, inode, buf, offset,
count, ext3_direct_io_get_blocks);
out_stop:
if (handle) {
int err;
lock_kernel();
if (orphan)
ext3_orphan_del(handle, inode);
if (orphan && ret > 0) {
loff_t end = offset + ret;
if (end > inode->i_size) {
ei->i_disksize = end;
inode->i_size = end;
err = ext3_mark_inode_dirty(handle, inode);
if (!ret)
ret = err;
}
}
err = ext3_journal_stop(handle, inode);
if (ret == 0)
ret = err;
unlock_kernel();
}
out:
return ret;
}
struct address_space_operations ext3_aops = { struct address_space_operations ext3_aops = {
.readpage = ext3_readpage, /* BKL not held. Don't need */ .readpage = ext3_readpage, /* BKL not held. Don't need */
.readpages = ext3_readpages, /* BKL not held. Don't need */ .readpages = ext3_readpages, /* BKL not held. Don't need */
.writepage = ext3_writepage, /* BKL not held. We take it */ .writepage = ext3_writepage, /* BKL not held. We take it */
.sync_page = block_sync_page, .sync_page = block_sync_page,
.prepare_write = ext3_prepare_write, /* BKL not held. We take it */ .prepare_write = ext3_prepare_write, /* BKL not held. We take it */
.commit_write = ext3_commit_write, /* BKL not held. We take it */ .commit_write = ext3_commit_write, /* BKL not held. We take it */
.bmap = ext3_bmap, /* BKL held */ .bmap = ext3_bmap, /* BKL held */
.invalidatepage = ext3_invalidatepage, /* BKL not held. Don't need */ .invalidatepage = ext3_invalidatepage, /* BKL not held. Don't need */
.releasepage = ext3_releasepage, /* BKL not held. Don't need */ .releasepage = ext3_releasepage, /* BKL not held. Don't need */
.direct_IO = ext3_direct_IO, /* BKL not held. Don't need */
}; };
/* For writeback mode, we can use mpage_writepages() */ /* For writeback mode, we can use mpage_writepages() */
...@@ -1416,6 +1516,7 @@ struct address_space_operations ext3_writeback_aops = { ...@@ -1416,6 +1516,7 @@ struct address_space_operations ext3_writeback_aops = {
.bmap = ext3_bmap, /* BKL held */ .bmap = ext3_bmap, /* BKL held */
.invalidatepage = ext3_invalidatepage, /* BKL not held. Don't need */ .invalidatepage = ext3_invalidatepage, /* BKL not held. Don't need */
.releasepage = ext3_releasepage, /* BKL not held. Don't need */ .releasepage = ext3_releasepage, /* BKL not held. Don't need */
.direct_IO = ext3_direct_IO, /* BKL not held. Don't need */
}; };
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment