Commit 35806b4f authored by Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits)
  jbd2: Add MAINTAINERS entry
  jbd2: fix a potential leak of a journal_head on an error path
  ext4: teach ext4_ext_split to calculate extents efficiently
  ext4: Convert ext4 to new truncate calling convention
  ext4: do not normalize block requests from fallocate()
  ext4: enable "punch hole" functionality
  ext4: add "punch hole" flag to ext4_map_blocks()
  ext4: punch out extents
  ext4: add new function ext4_block_zero_page_range()
  ext4: add flag to ext4_has_free_blocks
  ext4: reserve inodes and feature code for 'quota' feature
  ext4: add support for multiple mount protection
  ext4: ensure f_bfree returned by ext4_statfs() is non-negative
  ext4: protect bb_first_free in ext4_trim_all_free() with group lock
  ext4: only load buddy bitmap in ext4_trim_fs() when it is needed
  jbd2: Fix comment to match the code in jbd2__journal_start()
  ext4: fix waiting and sending of a barrier in ext4_sync_file()
  jbd2: Add function jbd2_trans_will_send_data_barrier()
  jbd2: fix sending of data flush on journal commit
  ext4: fix ext4_ext_fiemap_cb() to handle blocks before request range correctly
  ...
parents 32e51f14 d183e11a
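Two of the headline items in this series are the new "punch hole" support and multiple mount protection. As a rough userspace illustration only (not part of this pull, and assuming a libc that already exposes fallocate() and the FALLOC_FL_* flags), punching a hole in a file on ext4 would look something like the sketch below; the file name and offsets are made up for the example.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* "testfile" is a placeholder; any regular file on an ext4
	 * filesystem with this series applied will do. */
	int fd = open("testfile", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE;
	 * this deallocates 1 MiB starting at offset 4096 without
	 * changing i_size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}

Note that, per ext4_punch_hole() in the inode.c hunk below, non-extent-mapped files are rejected, so the call only succeeds on inodes using the extent format.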
...@@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support.
noacl This option disables POSIX Access Control List
support.
reservation
noreservation
bsddf (*) Make 'df' act like BSD.
minixdf Make 'df' act like Minix.
......
...@@ -3572,9 +3572,16 @@ M: Andrew Morton <akpm@linux-foundation.org>
M: Jan Kara <jack@suse.cz>
L: linux-ext4@vger.kernel.org
S: Maintained
F: fs/jbd*/
F: include/linux/ext*jbd*.h
F: include/linux/jbd*.h
F: fs/jbd/
F: include/linux/ext3_jbd.h
F: include/linux/jbd.h
JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
M: "Theodore Ts'o" <tytso@mit.edu>
L: linux-ext4@vger.kernel.org
S: Maintained
F: fs/jbd2/
F: include/linux/jbd2.h
JSM Neo PCI based serial card
M: Breno Leitao <leitao@linux.vnet.ibm.com>
......
...@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
......
...@@ -361,130 +361,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
return bh;
}
/**
* ext4_add_groupblocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
* @block: start physcial block to add to the block group
* @count: number of blocks to free
*
* This marks the blocks as free in the bitmap. We ask the
* mballoc to reload the buddy after this by setting group
* EXT4_GROUP_INFO_NEED_INIT_BIT flag
*/
void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
unsigned int i;
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err = 0, ret, blk_free_count;
ext4_grpblk_t blocks_freed;
struct ext4_group_info *grp;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
grp = ext4_get_group_info(sb, block_group);
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
goto error_return;
}
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc(sb, block_group, &gd_bh);
if (!desc)
goto error_return;
if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
in_range(ext4_inode_bitmap(sb, desc), block, count) ||
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
goto error_return;
}
/*
* We are about to add blocks to the bitmap,
* so we need undo access.
*/
BUFFER_TRACE(bitmap_bh, "getting undo access");
err = ext4_journal_get_undo_access(handle, bitmap_bh);
if (err)
goto error_return;
/*
* We are about to modify some metadata. Call the journal APIs
* to unshare ->b_data if a currently-committing transaction is
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
/*
* make sure we don't allow a parallel init on other groups in the
* same buddy cache
*/
down_write(&grp->alloc_sem);
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
bit + i, bitmap_bh->b_data)) {
ext4_error(sb, "bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
blocks_freed++;
}
}
ext4_lock_group(sb, block_group);
blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic_add(blocks_freed,
&sbi->s_flex_groups[flex_group].free_blocks);
}
/*
* request to reload the buddy with the
* new bitmap information
*/
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
grp->bb_free += blocks_freed;
up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
if (!err)
err = ret;
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
}
/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
...@@ -493,7 +369,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
* Check if filesystem has nblocks free & available for allocation.
* On success return 1, return 0 on failure.
*/
static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
s64 nblocks, unsigned int flags)
{
s64 free_blocks, dirty_blocks, root_blocks;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
...@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
EXT4_FREEBLOCKS_WATERMARK) {
free_blocks = percpu_counter_sum_positive(fbc);
dirty_blocks = percpu_counter_sum_positive(dbc);
if (dirty_blocks < 0) {
printk(KERN_CRIT "Dirty block accounting "
"went wrong %lld\n",
(long long)dirty_blocks);
}
}
/* Check whether we have space after
* accounting for current dirty blocks & root reserved blocks.
...@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
/* Hm, nope. Are (enough) root reserved blocks available? */
if (sbi->s_resuid == current_fsuid() ||
((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE)) {
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
if (free_blocks >= (nblocks + dirty_blocks))
return 1;
}
...@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
}
int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
s64 nblocks)
s64 nblocks, unsigned int flags)
{
if (ext4_has_free_blocks(sbi, nblocks)) {
if (ext4_has_free_blocks(sbi, nblocks, flags)) {
percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
return 0;
} else
...@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
...@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
ext4_fsblk_t goal, unsigned int flags,
unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
...@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ar.inode = inode;
ar.goal = goal;
ar.len = count ? *count : 1;
ar.flags = flags;
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
......
...@@ -6,20 +6,6 @@
#include <trace/events/ext4.h>
int __ext4_journal_get_undo_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
int err = 0;
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_undo_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
}
return err;
}
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
......
...@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
int __ext4_journal_get_undo_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
...@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
int __ext4_handle_dirty_super(const char *where, unsigned int line,
handle_t *handle, struct super_block *sb);
#define ext4_journal_get_undo_access(handle, bh) \
__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
......
...@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
};
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
#ifdef CONFIG_EXT4_FS_XATTR
......
...@@ -36,7 +36,7 @@
static void dump_completed_IO(struct inode * inode)
{
#ifdef EXT4_DEBUG
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
unsigned long flags;
...@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret;
tid_t commit_tid;
bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
...@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
}
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
if (jbd2_log_start_commit(journal, commit_tid)) {
/*
* When the journal is on a different device than the
* fs data disk, we need to issue the barrier in
* writeback mode. (In ordered mode, the jbd2 layer
* will take care of issuing the barrier. In
* data=journal, all of the data blocks are written to
* the journal device.)
*/
if (ext4_should_writeback_data(inode) &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
NULL);
ret = jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
ret = jbd2_log_wait_commit(journal, commit_tid);
if (needs_barrier)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
out:
trace_ext4_sync_file_exit(inode, ret);
......
...@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
current_block = ext4_new_meta_blocks(handle, inode,
goal, &count, err);
current_block = ext4_new_meta_blocks(handle, inode, goal,
0, &count, err);
if (*err)
goto failed_out;
...@@ -1930,7 +1930,7 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
dquot_release_reservation_block(inode, 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
...@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
continue;
}
if (PageWriteback(page))
wait_on_page_writeback(page);
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
if (mpd->next_page != page->index)
...@@ -3513,7 +3511,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
vmtruncate(inode, isize);
ext4_truncate_failed_write(inode);
}
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
...@@ -3915,10 +3913,31 @@ void ext4_set_aops(struct inode *inode)
*/
int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
return ext4_block_zero_page_range(handle, mapping, from, length);
}
/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
* that cooresponds to 'from'
*/
int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize, length, pos;
unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
...@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
return -EINVAL;
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
max = blocksize - (offset & (blocksize - 1));
/*
* correct length if it does not fall between
* 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
...@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
int ext4_can_truncate(struct inode *inode)
{
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return 0;
if (S_ISREG(inode->i_mode))
return 1;
if (S_ISDIR(inode->i_mode))
...@@ -4391,6 +4416,31 @@ int ext4_can_truncate(struct inode *inode)
return 0;
}
/*
* ext4_punch_hole: punches a hole in a file by releaseing the blocks
* associated with the given offset and length
*
* @inode: File inode
* @offset: The offset where the hole will begin
* @len: The length of the hole
*
* Returns: 0 on sucess or negative on failure
*/
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file->f_path.dentry->d_inode;
if (!S_ISREG(inode->i_mode))
return -ENOTSUPP;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
/* TODO: Add support for non extent hole punching */
return -ENOTSUPP;
}
return ext4_ext_punch_hole(file, offset, length);
}
/*
* ext4_truncate()
*
...@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/*
* Figure out the offset within the block group inode table
*/
inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
...@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
(attr->ia_size < inode->i_size ||
(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
(attr->ia_size < inode->i_size)) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
...@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
/* ext4_truncate will clear the flag */
if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
ext4_truncate(inode);
} }
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode))
rc = vmtruncate(inode, attr->ia_size);
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
ext4_truncate(inode);
} else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
ext4_truncate(inode);
}
if (!rc) {
setattr_copy(inode, attr);
...@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out_unlock;
}
ret = 0;
if (PageMappedToDisk(page))
goto out_unlock;
lock_page(page);
wait_on_page_writeback(page);
if (PageMappedToDisk(page)) {
up_read(&inode->i_alloc_sem);
return VM_FAULT_LOCKED;
}
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
lock_page(page);
/*
* return if we have all the buffers mapped. This avoid
* the need to call write_begin/write_end which does a
...@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
ext4_bh_unmapped)) {
unlock_page(page);
goto out_unlock;
up_read(&inode->i_alloc_sem);
return VM_FAULT_LOCKED;
}
}
unlock_page(page);
...@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret < 0)
goto out_unlock;
ret = 0;
/*
* write_begin/end might have created a dirty page and someone
* could wander in and start the IO. Make sure that hasn't
* happened.
*/
lock_page(page);
wait_on_page_writeback(page);
up_read(&inode->i_alloc_sem);
return VM_FAULT_LOCKED;
out_unlock:
if (ret)
ret = VM_FAULT_SIGBUS;
......
...@@ -193,11 +193,6 @@ struct ext4_allocation_context {
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
/*
* pointer to the held semaphore upon successful
* block allocation
*/
struct rw_semaphore *alloc_semp;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
...@@ -215,7 +210,6 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
struct rw_semaphore *alloc_semp;
};
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
......
...@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* We have the extent map build with the tmp inode.
* Now copy the i_data across
*/
ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/*
......
#include <linux/fs.h>
#include <linux/random.h>
#include <linux/buffer_head.h>
#include <linux/utsname.h>
#include <linux/kthread.h>
#include "ext4.h"
/*
* Write the MMP block using WRITE_SYNC to try to get the block on-disk
* faster.
*/
static int write_mmp_block(struct buffer_head *bh)
{
mark_buffer_dirty(bh);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh)))
return 1;
return 0;
}
/*
* Read the MMP block. It _must_ be read from disk and hence we clear the
* uptodate flag on the buffer.
*/
static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
ext4_fsblk_t mmp_block)
{
struct mmp_struct *mmp;
if (*bh)
clear_buffer_uptodate(*bh);
/* This would be sb_bread(sb, mmp_block), except we need to be sure
* that the MD RAID device cache has been bypassed, and that the read
* is not blocked in the elevator. */
if (!*bh)
*bh = sb_getblk(sb, mmp_block);
if (*bh) {
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
submit_bh(READ_SYNC, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
*bh = NULL;
}
}
if (!*bh) {
ext4_warning(sb, "Error while reading MMP block %llu",
mmp_block);
return -EIO;
}
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
return -EINVAL;
return 0;
}
/*
* Dump as much information as possible to help the admin.
*/
void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
const char *function, unsigned int line, const char *msg)
{
__ext4_warning(sb, function, line, msg);
__ext4_warning(sb, function, line,
"MMP failure info: last update time: %llu, last update "
"node: %s, last update device: %s\n",
(long long unsigned int) le64_to_cpu(mmp->mmp_time),
mmp->mmp_nodename, mmp->mmp_bdevname);
}
/*
* kmmpd will update the MMP sequence every s_mmp_update_interval seconds
*/
static int kmmpd(void *data)
{
struct super_block *sb = ((struct mmpd_data *) data)->sb;
struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
unsigned long failed_writes = 0;
int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
int retval;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
mmp->mmp_time = cpu_to_le64(get_seconds());
/*
* Start with the higher mmp_check_interval and reduce it if
* the MMP block is being updated on time.
*/
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->sysname,
sizeof(mmp->mmp_nodename));
while (!kthread_should_stop()) {
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
mmp->mmp_seq = cpu_to_le32(seq);
mmp->mmp_time = cpu_to_le64(get_seconds());
last_update_time = jiffies;
retval = write_mmp_block(bh);
/*
* Don't spew too many error messages. Print one every
* (s_mmp_update_interval * 60) seconds.
*/
if (retval && (failed_writes % 60) == 0) {
ext4_error(sb, "Error writing to MMP block");
failed_writes++;
}
if (!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_MMP)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
if (sb->s_flags & MS_RDONLY) {
ext4_warning(sb, "kmmpd being stopped since filesystem "
"has been remounted as readonly.");
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
HZ - diff);
/*
* We need to make sure that more than mmp_check_interval
* seconds have not passed since writing. If that has happened
* we need to check if the MMP block is as we left it.
*/
diff = jiffies - last_update_time;
if (diff > mmp_check_interval * HZ) {
struct buffer_head *bh_check = NULL;
struct mmp_struct *mmp_check;
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
ext4_error(sb, "error reading MMP data: %d",
retval);
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
if (mmp->mmp_seq != mmp_check->mmp_seq ||
memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
sizeof(mmp->mmp_nodename))) {
dump_mmp_msg(sb, mmp_check,
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
ext4_error(sb, "abort");
goto failed;
}
put_bh(bh_check);
}
/*
* Adjust the mmp_check_interval depending on how much time
* it took for the MMP block to be written.
*/
mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
EXT4_MMP_MAX_CHECK_INTERVAL),
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
}
/*
* Unmount seems to be clean.
*/
mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
mmp->mmp_time = cpu_to_le64(get_seconds());
retval = write_mmp_block(bh);
failed:
kfree(data);
brelse(bh);
return retval;
}
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
*/
static unsigned int mmp_new_seq(void)
{
u32 new_seq;
do {
get_random_bytes(&new_seq, sizeof(u32));
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
}
/*
* Protect the filesystem from being mounted more than once.
*/
int ext4_multi_mount_protect(struct super_block *sb,
ext4_fsblk_t mmp_block)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
int retval;
if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
mmp_block >= ext4_blocks_count(es)) {
ext4_warning(sb, "Invalid MMP block in superblock");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
/*
* If check_interval in MMP block is larger, use that instead of
* update_interval from the superblock.
*/
if (mmp->mmp_check_interval > mmp_check_interval)
mmp_check_interval = mmp->mmp_check_interval;
seq = le32_to_cpu(mmp->mmp_seq);
if (seq == EXT4_MMP_SEQ_CLEAN)
goto skip;
if (seq == EXT4_MMP_SEQ_FSCK) {
dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
goto failed;
}
wait_time = min(mmp_check_interval * 2 + 1,
mmp_check_interval + 60);
/* Print MMP interval if more than 20 secs. */
if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
ext4_warning(sb, "MMP interval %u higher than expected, please"
" wait.\n", wait_time * 2);
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
goto failed;
}
skip:
/*
* write a new random sequence number.
*/
mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
retval = write_mmp_block(bh);
if (retval)
goto failed;
/*
* wait for MMP interval and check mmp_seq.
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
goto failed;
}
mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
if (!mmpd_data) {
ext4_warning(sb, "not enough memory for mmpd_data");
goto failed;
}
mmpd_data->sb = sb;
mmpd_data->bh = bh;
/*
* Start a kernel thread to update the MMP block periodically.
*/
EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
goto failed;
}
return 0;
failed:
brelse(bh);
return 1;
}
...@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* It needs to call wait_on_page_writeback() to wait for the
* writeback of the page.
*/
if (PageWriteback(page))
wait_on_page_writeback(page);
wait_on_page_writeback(page);
/* Release old bh and drop refs */
try_to_release_page(page, 0);
......
...@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
ext4_handle_dirty_metadata(handle, dir, frame->bh);
ext4_handle_dirty_metadata(handle, dir, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
dx_release (frames);
if (!(de))
return retval;
if (!de) {
/*
* Even if the block split failed, we have to properly write
* out all the changes we did so far. Otherwise we can end up
* with corrupted filesystem.
*/
ext4_mark_inode_dirty(handle, dir);
dx_release(frames);
return retval;
}
dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
...@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
handle_t *handle;
struct inode *inode;
int l, err, retries = 0;
int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
...@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
dquot_initialize(dir);
if (l > EXT4_N_BLOCKS * 4) {
/*
* For non-fast symlinks, we just allocate inode and put it on
* orphan list in the first transaction => we need bitmap,
* group descriptor, sb, inode block, quota blocks.
*/
credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
} else {
/*
* Fast symlink. We have to add entry to directory
* (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
* allocate new inode (bitmap, group descriptor, inode block,
* quota blocks, sb is already counted in previous macros).
*/
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
}
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
handle = ext4_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
...@@ -2263,21 +2292,44 @@ static int ext4_symlink(struct inode *dir,
if (IS_ERR(inode))
goto out_stop;
if (l > sizeof(EXT4_I(inode)->i_data)) {
if (l > EXT4_N_BLOCKS * 4) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
* page_symlink() calls into ext4_prepare/commit_write.
* We have a transaction open. All is sweetness. It also sets
* i_size in generic_commit_write().
*/
/*
* We cannot call page_symlink() with transaction started
* because it calls into ext4_write_begin() which can wait
* for transaction commit if we are running out of space
* and thus we deadlock. So we have to stop transaction now
* and restart it when symlink contents is written.
*
* To keep fs consistent in case of crash, we have to put inode
* to orphan list in the mean time.
*/
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
if (err)
goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
if (err)
goto err_drop_inode;
/*
* Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
* + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
*/
handle = ext4_journal_start(dir,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto err_drop_inode;
}
inc_nlink(inode);
err = ext4_orphan_del(handle, inode);
if (err) {
clear_nlink(inode);
unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
iput(inode);
goto out_stop;
}
if (err) {
ext4_journal_stop(handle);
clear_nlink(inode);
goto err_drop_inode;
}
} else {
/* clear the extent format for fast symlink */
...@@ -2293,6 +2345,10 @@ static int ext4_symlink(struct inode *dir,
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
err_drop_inode:
unlock_new_inode(inode);
iput(inode);
return err;
}
static int ext4_link(struct dentry *old_dentry,
......
...@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head;
int partial_write = 0;
head = page_buffers(page);
if (error)
SetPageError(page);
BUG_ON(!head);
if (head->b_size != PAGE_CACHE_SIZE) {
loff_t offset;
loff_t io_end_offset = io_end->offset + io_end->size;
offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
bh = head;
do {
if ((offset >= io_end->offset) &&
(offset+bh->b_size <= io_end_offset)) {
if (error)
buffer_io_error(bh);
}
if (buffer_delay(bh))
partial_write = 1;
else if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
else if (buffer_dirty(bh))
partial_write = 1;
offset += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
}
/*
* If this is a partial write which happened to make
* all buffers uptodate then we can optimize away a
* bogus readpage() for the next read(). Here we
* 'discover' whether the page went uptodate as a
* result of this (potentially partial) write.
*/
if (!partial_write)
SetPageUptodate(page);
put_io_page(io_end->pages[i]);
loff_t offset;
loff_t io_end_offset;
if (error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
head = page_buffers(page);
BUG_ON(!head);
io_end_offset = io_end->offset + io_end->size;
offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
bh = head;
do {
if ((offset >= io_end->offset) &&
(offset+bh->b_size <= io_end_offset))
buffer_io_error(bh);
offset += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
}
put_io_page(io_end->pages[i]);
}
io_end->num_io_pages = 0;
......
...@@ -820,8 +820,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
block = ext4_new_meta_blocks(handle, inode,
goal, NULL, &error);
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
if (error)
goto cleanup;
......
...@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
commit_transaction->t_flushed_data_blocks = 1;
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
smp_mb__after_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
...@@ -672,12 +671,16 @@ void jbd2_journal_commit_transaction(journal_t *journal)
err = 0;
}
write_lock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_state = T_COMMIT_DFLUSH;
write_unlock(&journal->j_state_lock);
/*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
* the commit record
*/
if (commit_transaction->t_flushed_data_blocks &&
if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
...@@ -754,8 +757,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
/* Wake up any transactions which were waiting for this
IO to complete */
/*
* Wake up any transactions which were waiting for this IO to
* complete. The barrier must be here so that changes by
* jbd2_journal_file_buffer() take effect before wake_up_bit()
* does the waitqueue check.
*/
smp_mb();
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
...@@ -794,6 +802,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_abort(journal, err);
jbd_debug(3, "JBD: commit phase 5\n");
write_lock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
commit_transaction->t_state = T_COMMIT_JFLUSH;
write_unlock(&journal->j_state_lock);
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
...@@ -949,7 +961,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
commit_transaction->t_start = jiffies;
stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
......
...@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
/*
* Are we already doing a recent enough commit?
*/
/*
* The only transaction we can possibly wait upon is the
* currently running transaction (if it exists). Otherwise,
* the target tid must be an old one.
*/
if (!tid_geq(journal->j_commit_request, target)) {
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == target) {
/*
* We want a new commit: OK, mark the request and wakeup the
* commit thread. We do _not_ do the commit ourselves.
...@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
return 1;
}
} else if (!tid_geq(journal->j_commit_request, target))
/* This should never happen, but if it does, preserve
the evidence before kjournald goes into a loop and
increments j_commit_sequence beyond all recognition. */
WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
journal->j_commit_request,
journal->j_commit_sequence,
target, journal->j_running_transaction ?
journal->j_running_transaction->t_tid : 0);
return 0;
}
...@@ -576,6 +587,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
return ret;
}
/*
* Return 1 if a given transaction has not yet sent barrier request
* connected with a transaction commit. If 0 is returned, transaction
* may or may not have sent the barrier. Used to avoid sending barrier
* twice in common cases.
*/
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
int ret = 0;
transaction_t *commit_trans;
if (!(journal->j_flags & JBD2_BARRIER))
return 0;
read_lock(&journal->j_state_lock);
/* Transaction already committed? */
if (tid_geq(journal->j_commit_sequence, tid))
goto out;
commit_trans = journal->j_committing_transaction;
if (!commit_trans || commit_trans->t_tid != tid) {
ret = 1;
goto out;
}
/*
* Transaction is being committed and we already proceeded to
* submitting a flush to fs partition?
*/
if (journal->j_fs_dev != journal->j_dev) {
if (!commit_trans->t_need_data_flush ||
commit_trans->t_state >= T_COMMIT_DFLUSH)
goto out;
} else {
if (commit_trans->t_state >= T_COMMIT_JFLUSH)
goto out;
}
ret = 1;
out:
read_unlock(&journal->j_state_lock);
return ret;
}
EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
/*
* Wait for a specified commit to complete.
* The caller may not hold the journal lock.
......
...@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
*/
/*
* Update transiaction's maximum wait time, if debugging is enabled.
* Update transaction's maximum wait time, if debugging is enabled.
*
* In order for t_max_wait to be reliable, it must be protected by a
* lock. But doing so will mean that start_this_handle() can not be
...@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
* means that maximum wait time reported by the jbd2_run_stats
* tracepoint will always be zero.
*/
static inline void update_t_max_wait(transaction_t *transaction)
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
{
#ifdef CONFIG_JBD2_DEBUG
unsigned long ts = jiffies;
if (jbd2_journal_enable_debug &&
time_after(transaction->t_start, ts)) {
ts = jbd2_time_diff(ts, transaction->t_start);
...@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
tid_t tid;
int needed, need_to_start;
int nblocks = handle->h_buffer_credits;
unsigned long ts = jiffies;
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
...@@ -271,7 +271,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
/* OK, account for the buffers that this operation expects to
* use and add the handle to the running transaction.
*/
update_t_max_wait(transaction);
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
...@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
* Return a pointer to a newly allocated handle, or NULL on failure
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
{
...@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
*/
JBUFFER_TRACE(jh, "cancelling revoke");
jbd2_journal_cancel_revoke(handle, jh);
jbd2_journal_put_journal_head(jh);
out:
jbd2_journal_put_journal_head(jh);
return err;
}
...@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
jinode->i_next_transaction == transaction)
goto done;
/*
* We only ever set this variable to 1 so the test is safe. Since
* t_need_data_flush is likely to be set, we do the test to save some
* cacheline bouncing
*/
if (!transaction->t_need_data_flush)
transaction->t_need_data_flush = 1;
/* On some different transaction's list - should be
* the committing one */
if (jinode->i_transaction) {
......
...@@ -529,9 +529,10 @@ struct transaction_s
enum {
T_RUNNING,
T_LOCKED,
T_RUNDOWN,
T_FLUSH,
T_COMMIT,
T_COMMIT_DFLUSH,
T_COMMIT_JFLUSH,
T_FINISHED
} t_state;
...@@ -658,7 +659,9 @@ struct transaction_s
* waiting for it to finish.
*/
unsigned int t_synchronous_commit:1;
unsigned int t_flushed_data_blocks:1;
/* Disk flush needs to be sent to fs partition [no locking] */
int t_need_data_flush;
/*
* For use by the filesystem to store fs-specific data
...@@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_journal_force_commit_nested(journal_t *journal);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
void __jbd2_log_wait_for_space(journal_t *journal);
extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
......