Commit 35806b4f authored by Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits)
  jbd2: Add MAINTAINERS entry
  jbd2: fix a potential leak of a journal_head on an error path
  ext4: teach ext4_ext_split to calculate extents efficiently
  ext4: Convert ext4 to new truncate calling convention
  ext4: do not normalize block requests from fallocate()
  ext4: enable "punch hole" functionality
  ext4: add "punch hole" flag to ext4_map_blocks()
  ext4: punch out extents
  ext4: add new function ext4_block_zero_page_range()
  ext4: add flag to ext4_has_free_blocks
  ext4: reserve inodes and feature code for 'quota' feature
  ext4: add support for multiple mount protection
  ext4: ensure f_bfree returned by ext4_statfs() is non-negative
  ext4: protect bb_first_free in ext4_trim_all_free() with group lock
  ext4: only load buddy bitmap in ext4_trim_fs() when it is needed
  jbd2: Fix comment to match the code in jbd2__journal_start()
  ext4: fix waiting and sending of a barrier in ext4_sync_file()
  jbd2: Add function jbd2_trans_will_send_data_barrier()
  jbd2: fix sending of data flush on journal commit
  ext4: fix ext4_ext_fiemap_cb() to handle blocks before request range correctly
  ...
parents 32e51f14 d183e11a
@@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support.
 noacl			This option disables POSIX Access Control List
			support.
-reservation
-noreservation
 bsddf		(*)	Make 'df' act like BSD.
 minixdf		Make 'df' act like Minix.
...
@@ -3572,9 +3572,16 @@ M: Andrew Morton <akpm@linux-foundation.org>
 M:	Jan Kara <jack@suse.cz>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd*/
-F:	include/linux/ext*jbd*.h
-F:	include/linux/jbd*.h
+F:	fs/jbd/
+F:	include/linux/ext3_jbd.h
+F:	include/linux/jbd.h
+
+JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
+M:	"Theodore Ts'o" <tytso@mit.edu>
+L:	linux-ext4@vger.kernel.org
+S:	Maintained
+F:	fs/jbd2/
+F:	include/linux/jbd2.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao <leitao@linux.vnet.ibm.com>
...
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
...
@@ -361,130 +361,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	return bh;
 }
/**
* ext4_add_groupblocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
* @block: start physcial block to add to the block group
* @count: number of blocks to free
*
* This marks the blocks as free in the bitmap. We ask the
* mballoc to reload the buddy after this by setting group
* EXT4_GROUP_INFO_NEED_INIT_BIT flag
*/
void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
unsigned int i;
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err = 0, ret, blk_free_count;
ext4_grpblk_t blocks_freed;
struct ext4_group_info *grp;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
grp = ext4_get_group_info(sb, block_group);
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
goto error_return;
}
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc(sb, block_group, &gd_bh);
if (!desc)
goto error_return;
if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
in_range(ext4_inode_bitmap(sb, desc), block, count) ||
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
goto error_return;
}
/*
* We are about to add blocks to the bitmap,
* so we need undo access.
*/
BUFFER_TRACE(bitmap_bh, "getting undo access");
err = ext4_journal_get_undo_access(handle, bitmap_bh);
if (err)
goto error_return;
/*
* We are about to modify some metadata. Call the journal APIs
* to unshare ->b_data if a currently-committing transaction is
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
/*
* make sure we don't allow a parallel init on other groups in the
* same buddy cache
*/
down_write(&grp->alloc_sem);
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
bit + i, bitmap_bh->b_data)) {
ext4_error(sb, "bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
blocks_freed++;
}
}
ext4_lock_group(sb, block_group);
blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic_add(blocks_freed,
&sbi->s_flex_groups[flex_group].free_blocks);
}
/*
* request to reload the buddy with the
* new bitmap information
*/
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
grp->bb_free += blocks_freed;
up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
if (!err)
err = ret;
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
}
 /**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
@@ -493,7 +369,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 			EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope. Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+		ext4_fsblk_t goal, unsigned int flags,
+		unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
...
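Taken together, the balloc.c changes thread an allocation-flags word from every caller down to ext4_has_free_blocks(), so that individual allocations (not just privileged processes) can opt into the root-reserved block pool. A minimal sketch of a hypothetical caller, using only the interfaces shown above (illustration, not part of this merge):

	/* Hypothetical caller: allocate one metadata block, allowing the
	 * allocation to dip into the root-reserved pool.  This matters for
	 * operations such as hole punching that must be able to allocate a
	 * split node even on a full filesystem. */
	static ext4_fsblk_t alloc_meta_may_use_root(handle_t *handle,
						    struct inode *inode,
						    ext4_fsblk_t goal, int *errp)
	{
		unsigned long count = 1;

		return ext4_new_meta_blocks(handle, inode, goal,
					    EXT4_MB_USE_ROOT_BLOCKS,
					    &count, errp);
	}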
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT	0x0020
+/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE	0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize;	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16	s_raid_stride;		/* RAID stride */
-	__le16	s_mmp_interval;		/* # seconds to wait in MMP checking */
+	__le16	s_mmp_update_interval;	/* # seconds to wait in MMP checking */
 	__le64	s_mmp_block;		/* Block for multi-mount protection */
 	__le32	s_raid_stripe_width;	/* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
					 EXT4_FEATURE_INCOMPAT_RECOVER| \
					 EXT4_FEATURE_INCOMPAT_META_BG| \
					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1614,6 +1638,67 @@ struct ext4_features {
 	struct completion f_kobj_unregister;
 };
/*
* This structure will be used for multiple mount protection. It will be
* written into the block number saved in the s_mmp_block field in the
* superblock. Programs that check MMP should assume that if
* SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
* to use the filesystem, regardless of how old the timestamp is.
*/
#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
struct mmp_struct {
__le32 mmp_magic; /* Magic number for MMP */
__le32 mmp_seq; /* Sequence no. updated periodically */
/*
* mmp_time, mmp_nodename & mmp_bdevname are only used for information
* purposes and do not affect the correctness of the algorithm
*/
__le64 mmp_time; /* Time last updated */
char mmp_nodename[64]; /* Node which last updated MMP block */
char mmp_bdevname[32]; /* Bdev which last updated MMP block */
/*
* mmp_check_interval is used to verify if the MMP block has been
* updated on the block device. The value is updated based on the
* maximum time to write the MMP block during an update cycle.
*/
__le16 mmp_check_interval;
__le16 mmp_pad1;
__le32 mmp_pad2[227];
};
/* arguments passed to the mmp thread */
struct mmpd_data {
struct buffer_head *bh; /* bh from initial read_mmp_block() */
struct super_block *sb; /* super block of the fs */
};
/*
* Check interval multiplier
* The MMP block is written every update interval and initially checked every
* update interval x the multiplier (the value is then adapted based on the
* write latency). The reason is that writes can be delayed under load and we
* don't want readers to incorrectly assume that the filesystem is no longer
* in use.
*/
#define EXT4_MMP_CHECK_MULT 2UL
/*
* Minimum interval for MMP checking in seconds.
*/
#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
/*
* Maximum interval for MMP checking in seconds.
*/
#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
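The sequence codes above partition the mmp_seq space: values up to EXT4_MMP_SEQ_MAX are ordinary "in use" sequence numbers, while EXT4_MMP_SEQ_CLEAN and EXT4_MMP_SEQ_FSCK are terminal states. A simplified sketch of the decision a checker has to make (an illustration of the protocol described in the comments above, not the mmp.c code itself):

	/* Sketch: classify an MMP block read from s_mmp_block. */
	static int mmp_block_state(struct mmp_struct *mmp)
	{
		u32 seq = le32_to_cpu(mmp->mmp_seq);

		if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
			return -EINVAL;		/* not an MMP block */
		if (seq == EXT4_MMP_SEQ_CLEAN)
			return 0;		/* cleanly unmounted: safe */
		if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
			return -EBUSY;		/* fsck running or unknown code: never safe */
		/*
		 * Otherwise: wait EXT4_MMP_CHECK_MULT * mmp_check_interval
		 * seconds, re-read the block, and treat a changed mmp_seq
		 * as proof that another node has the filesystem mounted.
		 */
		return 1;			/* must re-check after a delay */
	}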
 /*
  * Function prototypes
  */
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				 ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
		__LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
				    struct super_block *, ext4_group_t, \
				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
			       int len,
			       struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
	BH_Uninit	/* blocks are allocated but uninitialized on disk */
...
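With the ext4_punch_hole()/ext4_ext_punch_hole() entry points declared above, the feature is driven from userspace through fallocate(2). A minimal userspace example, assuming a kernel carrying this merge and an extent-mapped file (FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>

	/* Deallocate 1 MiB starting at offset 4 MiB.  The file size is
	 * unchanged; subsequent reads of the punched range return zeroes. */
	static int punch_example(int fd)
	{
		return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				 4 << 20, 1 << 20);
	}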
@@ -6,20 +6,6 @@
 
 #include <trace/events/ext4.h>
int __ext4_journal_get_undo_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
int err = 0;
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_undo_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
}
return err;
}
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
				    handle_t *handle, struct buffer_head *bh)
 {
...
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
			       const char *err_fn,
			       struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
				    handle_t *handle, struct buffer_head *bh);
 
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
			      handle_t *handle, struct super_block *sb);
 
-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
...
@@ -46,6 +46,13 @@
 
 #include <trace/events/ext4.h>
static int ext4_split_extent(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
struct ext4_map_blocks *map,
int split_flag,
int flags);
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
	ext4_fsblk_t goal, newblock;
 
	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
	return newblock;
 }
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
	}
	ext_debug("\n");
 }
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
ext4_fsblk_t newblock, int level)
{
int depth = ext_depth(inode);
struct ext4_extent *ex;
if (depth != level) {
struct ext4_extent_idx *idx;
idx = path[level].p_idx;
while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
ext_debug("%d: move %d:%llu in new index %llu\n", level,
le32_to_cpu(idx->ei_block),
ext4_idx_pblock(idx),
newblock);
idx++;
}
return;
}
ex = path[depth].p_ext;
while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(ex->ee_block),
ext4_ext_pblock(ex),
ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex),
newblock);
ex++;
}
}
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  *   - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path,
-				struct ext4_extent *newext, int at)
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
 {
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
-	struct ext4_extent *ex;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
-						   newext, &err);
+						   newext, &err, flags);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
-	ex = EXT_FIRST_EXTENT(neh);
 
	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
		goto cleanup;
	}
	/* start copy from next extent */
-	/* TODO: we could do it by single memmove */
-	m = 0;
-	path[depth].p_ext++;
-	while (path[depth].p_ext <=
-			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext4_ext_pblock(path[depth].p_ext),
-				ext4_ext_is_uninitialized(path[depth].p_ext),
-				ext4_ext_get_actual_len(path[depth].p_ext),
-				newblock);
-		/*memmove(ex++, path[depth].p_ext++,
-			sizeof(struct ext4_extent));
-		neh->eh_entries++;*/
-		path[depth].p_ext++;
-		m++;
-	}
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
	if (m) {
-		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
		le16_add_cpu(&neh->eh_entries, m);
	}
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);
 
-		/* copy indexes */
-		m = 0;
-		path[i].p_idx++;
-
-		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-				EXT_MAX_INDEX(path[i].p_hdr));
+		/* move remainder of path[i] to the new index block */
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
			err = -EIO;
			goto cleanup;
		}
-		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-			ext_debug("%d: move %d:%llu in new index %llu\n", i,
-					le32_to_cpu(path[i].p_idx->ei_block),
-					ext4_idx_pblock(path[i].p_idx),
-					newblock);
-			/*memmove(++fidx, path[i].p_idx++,
-				sizeof(struct ext4_extent_idx));
-			neh->eh_entries++;
-			BUG_ON(neh->eh_entries > neh->eh_max);*/
-			path[i].p_idx++;
-			m++;
-		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
		if (m) {
-			memmove(++fidx, path[i].p_idx - m,
+			memmove(++fidx, path[i].p_idx,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
@@ -1056,8 +1073,9 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				 unsigned int flags,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *newext)
 {
	struct ext4_ext_path *curp = path;
	struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
	ext4_fsblk_t newblock;
	int err = 0;
 
-	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+		newext, &err, flags);
	if (newblock == 0)
		return err;
@@ -1140,8 +1159,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				    unsigned int flags,
+				    struct ext4_ext_path *path,
+				    struct ext4_extent *newext)
 {
	struct ext4_ext_path *curp;
	int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
-		err = ext4_ext_split(handle, inode, path, newext, i);
+		err = ext4_ext_split(handle, inode, flags, path, newext, i);
		if (err)
			goto out;
@@ -1174,7 +1194,8 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
		err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags,
+					    path, newext);
		if (err)
			goto out;
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 struct ext4_extent *ex)
 {
@@ -1602,6 +1623,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 
	return merge_done;
 }
/*
* This function tries to merge the @ex extent to neighbours in the tree.
* return 1 if merge left else 0.
*/
static int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex) {
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
int ret = 0;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
eh = path[depth].p_hdr;
if (ex > EXT_FIRST_EXTENT(eh))
merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
if (!merge_done)
ret = ext4_ext_try_to_merge_right(inode, path, ex);
return ret;
}
 /*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
	int depth, len, err;
	ext4_lblk_t next;
	unsigned uninitialized = 0;
+	int flags = 0;
 
	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
-	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
	if (err)
		goto cleanup;
	depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
+ * ext4_ext_check_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-		  struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+	struct ext4_ext_cache *ex){
	struct ext4_ext_cache *cex;
+	struct ext4_sb_info *sbi;
	int ret = 0;
 
	/*
@@ -2017,25 +2078,59 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
	 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
+	sbi = EXT4_SB(inode->i_sb);
 
	/* has cache valid data? */
	if (cex->ec_len == 0)
		goto errout;
 
	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
		ret = 1;
	}
 errout:
+	if (!ret)
+		sbi->extent_cache_misses++;
+	else
+		sbi->extent_cache_hits++;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return ret;
 }
/*
* ext4_ext_in_cache()
* Checks to see if the given block is in the cache.
* If it is, the cached extent is stored in the given
* extent pointer.
*
* @inode: The files inode
* @block: The block to look for in the cache
* @ex: Pointer where the cached extent will be stored
* if it contains block
*
* Return 0 if cache is invalid; 1 if the cache is valid
*/
static int
ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
struct ext4_extent *ex)
{
struct ext4_ext_cache cex;
int ret = 0;
if (ext4_ext_check_cache(inode, block, &cex)) {
ex->ee_block = cpu_to_le32(cex.ec_block);
ext4_ext_store_pblock(ex, cex.ec_start);
ex->ee_len = cpu_to_le16(cex.ec_len);
ret = 1;
}
return ret;
}
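The reason for keeping both functions is that struct ext4_extent cannot represent a cached hole (a gap is cached with ec_start == 0, which has no valid on-disk extent encoding), while the raw struct ext4_ext_cache can. A hedged sketch of a hole-aware caller built on the raw-cache variant (hypothetical helper, for illustration only):

	/* Hypothetical: how many cached-hole blocks remain from 'block' on.
	 * Assumes gaps are cached with ec_start == 0 by
	 * ext4_ext_put_gap_in_cache(). */
	static ext4_lblk_t cached_hole_len(struct inode *inode, ext4_lblk_t block)
	{
		struct ext4_ext_cache cex;

		if (ext4_ext_check_cache(inode, block, &cex) && cex.ec_start == 0)
			return cex.ec_len - (block - cex.ec_block);
		return 0;
	}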
 /*
  * ext4_ext_rm_idx:
  * removes index from the index block.
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
		ext4_free_blocks(handle, inode, NULL, start, num, flags);
	} else if (from == le32_to_cpu(ex->ee_block)
		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		/* head removal */
+		ext4_lblk_t num;
+		ext4_fsblk_t start;
+
+		num = to - from;
+		start = ext4_ext_pblock(ex);
+
+		ext_debug("free first %u blocks starting %llu\n", num, start);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
+
	} else {
		printk(KERN_INFO "strange request: removal(2) "
				"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
	return 0;
 }
/*
* ext4_ext_rm_leaf() Removes the extents associated with the
* blocks appearing between "start" and "end", and splits the extents
* if "start" and "end" appear in the same extent
*
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
* @start: The first block to remove
* @end: The last block to remove
*/
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start)
+		struct ext4_ext_path *path, ext4_lblk_t start,
+		ext4_lblk_t end)
 {
	int err = 0, correct_index = 0;
	int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
	unsigned short ex_ee_len;
	unsigned uninitialized = 0;
	struct ext4_extent *ex;
+	struct ext4_map_blocks map;
 
	/* the header must be checked already in ext4_ext_remove_space() */
	ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
		path[depth].p_ext = ex;
 
		a = ex_ee_block > start ? ex_ee_block : start;
-		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;
 
		ext_debug("  border %u:%u\n", a, b);
 
-		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-			block = 0;
-			num = 0;
-			BUG();
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end <= ex_ee_block) {
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (a != ex_ee_block &&
+			   b != ex_ee_block + ex_ee_len - 1) {
+			/*
+			 * If this is a truncate, then this condition should
+			 * never happen because at least one of the end points
+			 * needs to be on the edge of the extent.
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				block = 0;
+				num = 0;
+				err = -EIO;
+				goto out;
+			}
+			/*
+			 * else this is a hole punch, so the extent needs to
+			 * be split since neither edge of the hole is on the
+			 * extent edge
+			 */
+			else{
+				map.m_pblk = ext4_ext_pblock(ex);
+				map.m_lblk = ex_ee_block;
+				map.m_len = b - ex_ee_block;
+
+				err = ext4_split_extent(handle,
+					inode, path, &map, 0,
+					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+					EXT4_GET_BLOCKS_PRE_IO);
+
+				if (err < 0)
+					goto out;
+
+				ex_ee_len = ext4_ext_get_actual_len(ex);
+
+				b = ex_ee_block+ex_ee_len - 1 < end ?
+					ex_ee_block+ex_ee_len - 1 : end;
+
+				/* Then remove tail of this extent */
+				block = ex_ee_block;
+				num = a - block;
+			}
		} else if (a != ex_ee_block) {
			/* remove tail of the extent */
			block = ex_ee_block;
			num = a - block;
		} else if (b != ex_ee_block + ex_ee_len - 1) {
			/* remove head of the extent */
-			block = a;
-			num = b - a;
-			/* there is no "make a hole" API yet */
-			BUG();
+			block = b;
+			num = ex_ee_block + ex_ee_len - b;
+
+			/*
+			 * If this is a truncate, this condition
+			 * should never happen
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
		} else {
			/* remove whole extent: excellent! */
			block = ex_ee_block;
			num = 0;
-			BUG_ON(a != ex_ee_block);
-			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+			if (a != ex_ee_block) {
+				ext_debug("  bad truncate %u:%u\n",
					start, end);
+				err = -EIO;
+				goto out;
+			}
+
+			if (b != ex_ee_block + ex_ee_len - 1) {
+				ext_debug("  bad truncate %u:%u\n",
					start, end);
+				err = -EIO;
+				goto out;
+			}
		}
 		/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
		if (num == 0) {
			/* this extent is removed; mark slot entirely unused */
			ext4_ext_store_pblock(ex, 0);
-			le16_add_cpu(&eh->eh_entries, -1);
+		} else if (block != ex_ee_block) {
+			/*
+			 * If this was a head removal, then we need to update
+			 * the physical block since it is now at a different
+			 * location
+			 */
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
		}
 
		ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
		if (err)
			goto out;
/*
* If the extent was completely released,
* we need to remove it from the leaf
*/
if (num == 0) {
if (end != EXT_MAX_BLOCK) {
/*
* For hole punching, we need to scoot all the
* extents up when an extent is removed so that
* we dont have blank extents in the middle
*/
memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
sizeof(struct ext4_extent));
/* Now get rid of the one at the end */
memset(EXT_LAST_EXTENT(eh), 0,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
}
ext_debug("new extent: %u:%u:%llu\n", block, num, ext_debug("new extent: %u:%u:%llu\n", block, num,
ext4_ext_pblock(ex)); ext4_ext_pblock(ex));
ex--; ex--;
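The head-removal bookkeeping above deserves a worked example, stated purely in the function's own variables (values are illustrative):

	/* Example: extent with ex_ee_block = 100, ex_ee_len = 20, physical
	 * start P; the punched range gives a = 100, b = 109.  The
	 * head-removal branch sets
	 *	block = b				-> 109
	 *	num   = ex_ee_block + ex_ee_len - b	-> 100 + 20 - 109 = 11
	 * and the later "block != ex_ee_block" fixup shifts the physical start:
	 *	pblock = P + (b - a)			-> P + 9
	 * so the surviving tail keeps its logical-to-physical alignment.
	 */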
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				ext4_lblk_t end)
 {
	struct super_block *sb = inode->i_sb;
	int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
	while (i >= 0 && err == 0) {
		if (i == depth) {
			/* this is leaf block */
-			err = ext4_ext_rm_leaf(handle, inode, path, start);
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					start, end);
			/* root level has p_bh == NULL, brelse() eats this */
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
	return ret;
 }
/*
* used by extent splitting.
*/
#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
due to ENOSPC */
#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
/*
* ext4_split_extent_at() splits an extent at given block.
*
* @handle: the journal handle
* @inode: the file inode
* @path: the path to the extent
 * @split: the logical block where the extent is split.
* @split_flags: indicates if the extent could be zeroout if split fails, and
* the states(init or uninit) of new extents.
* @flags: flags used to insert new extent to extent tree.
*
*
 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
 * of which are determined by split_flag.
*
* There are two cases:
 * a> the extent is split into two extents.
* b> split is not needed, and just mark the extent.
*
* return 0 on success.
*/
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t split,
int split_flag,
int flags)
{
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex2 = NULL;
unsigned int ee_len, depth;
int err = 0;
ext_debug("ext4_split_extents_at: inode %lu, logical"
"block %llu\n", inode->i_ino, (unsigned long long)split);
ext4_ext_show_leaf(inode, path);
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
newblock = split - ee_block + ext4_ext_pblock(ex);
BUG_ON(split < ee_block || split >= (ee_block + ee_len));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
if (split == ee_block) {
/*
* case b: block @split is the block that the extent begins with
* then we just change the state of the extent, and splitting
* is not needed.
*/
if (split_flag & EXT4_EXT_MARK_UNINIT2)
ext4_ext_mark_uninitialized(ex);
else
ext4_ext_mark_initialized(ex);
if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
}
/* case a */
memcpy(&orig_ex, ex, sizeof(orig_ex));
ex->ee_len = cpu_to_le16(split - ee_block);
if (split_flag & EXT4_EXT_MARK_UNINIT1)
ext4_ext_mark_uninitialized(ex);
/*
* path may lead to a new leaf, not to the original leaf any more
* after ext4_ext_insert_extent() returns.
*/
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto fix_extent_len;
ex2 = &newex;
ex2->ee_block = cpu_to_le32(split);
ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
ext4_ext_store_pblock(ex2, newblock);
if (split_flag & EXT4_EXT_MARK_UNINIT2)
ext4_ext_mark_uninitialized(ex2);
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_len = cpu_to_le16(ee_len);	/* ee_len is __le16 */
ext4_ext_try_to_merge(inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
} else if (err)
goto fix_extent_len;
out:
ext4_ext_show_leaf(inode, path);
return err;
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
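Stripped of journal handling and error paths, the arithmetic performed by ext4_split_extent_at() is small. A standalone sketch (our own struct and names, not ext4's) of what a split inside an extent does to (lblk, len, pblk):

	#include <assert.h>
	#include <stdio.h>

	struct ext { unsigned int lblk, len; unsigned long long pblk; };

	/* Sketch of case a: shrink *e to [lblk, split), return [split, end). */
	static struct ext split_at(struct ext *e, unsigned int split)
	{
		struct ext right;

		assert(split > e->lblk && split < e->lblk + e->len);
		right.lblk = split;
		right.len  = e->lblk + e->len - split;
		right.pblk = e->pblk + (split - e->lblk); /* 'newblock' above */
		e->len     = split - e->lblk;
		return right;
	}

	int main(void)
	{
		struct ext e = { 100, 50, 5000 };
		struct ext r = split_at(&e, 120);

		printf("[%u,+%u)@%llu [%u,+%u)@%llu\n",
		       e.lblk, e.len, e.pblk, r.lblk, r.len, r.pblk);
		/* prints: [100,+20)@5000 [120,+30)@5020 */
		return 0;
	}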
/*
* ext4_split_extent() splits an extent and marks the extent which is covered
* by @map as split_flag indicates
*
* It may result in splitting the extent into multiple extents (up to three)
* There are three possibilities:
* a> There is no split required
* b> Splits into two extents: the split happens at either end of the extent
* c> Splits into three extents: someone is splitting in the middle of the extent
*
*/
static int ext4_split_extent(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
struct ext4_map_blocks *map,
int split_flag,
int flags)
{
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len, depth;
int err = 0;
int uninitialized;
int split_flag1, flags1;
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
uninitialized = ext4_ext_is_uninitialized(ex);
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
EXT4_EXT_MAY_ZEROOUT : 0;
flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
EXT4_EXT_MARK_UNINIT2;
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
if (err)
goto out;
}
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path))
return PTR_ERR(path);
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
EXT4_EXT_MAY_ZEROOUT : 0;
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1;
if (split_flag & EXT4_EXT_MARK_UNINIT2)
split_flag1 |= EXT4_EXT_MARK_UNINIT2;
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
if (err)
goto out;
}
ext4_ext_show_leaf(inode, path);
out:
return err ? err : map->m_len;
}
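Read together with ext4_split_extent_at(), the order of operations matters: the right edge is split first, the path is re-found (the extent may have moved to a new leaf), and only then is the left edge split. For an extent [100, 150) and a map of [120, 130), the two passes look like this (our diagram, not from the patch):

	extent:          [100 .............................. 150)
	map:                          [120 .... 130)
	after 1st split: [100 .................. 130)[130 .... 150)
	after 2nd split: [100 .... 120)[120 .... 130)[130 .... 150)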
#define EXT4_EXT_ZERO_LEN 7 #define EXT4_EXT_ZERO_LEN 7
/* /*
* This function is called by ext4_ext_map_blocks() if someone tries to write * This function is called by ext4_ext_map_blocks() if someone tries to write
...@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_map_blocks *map, struct ext4_map_blocks *map,
struct ext4_ext_path *path) struct ext4_ext_path *path)
{ {
struct ext4_extent *ex, newex, orig_ex; struct ext4_map_blocks split_map;
struct ext4_extent *ex1 = NULL; struct ext4_extent zero_ex;
struct ext4_extent *ex2 = NULL; struct ext4_extent *ex;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
ext4_lblk_t ee_block, eof_block; ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth; unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0; int err = 0;
int ret = 0; int split_flag = 0;
int may_zeroout;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino, "block %llu, max_blocks %u\n", inode->i_ino,
...@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
eof_block = map->m_lblk + map->m_len; eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode); depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext; ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block); ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex); ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block); allocated = ee_len - (map->m_lblk - ee_block);
newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
WARN_ON(map->m_lblk < ee_block);
/* /*
* It is safe to convert extent to initialized via explicit * It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size. * zeroout only if extent is fully inside i_size or new_size.
*/ */
may_zeroout = ee_block + ee_len <= eof_block; split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */ /* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */
if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
err = ext4_ext_zeroout(inode, &orig_ex); (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
err = ext4_ext_zeroout(inode, ex);
if (err) if (err)
goto fix_extent_len; goto out;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
return allocated;
}
/* ex1: ee_block to map->m_lblk - 1 : uninitialized */ err = ext4_ext_get_access(handle, inode, path + depth);
if (map->m_lblk > ee_block) { if (err)
ex1 = ex; goto out;
ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_initialized(ex);
ext4_ext_mark_uninitialized(ex1); ext4_ext_try_to_merge(inode, path, ex);
ex2 = &newex; err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
} }
/* /*
* for sanity, update the length of the ex2 extent before * four cases:
* we insert ex3, if ex1 is NULL. This is to avoid temporary * 1. split the extent into three extents.
* overlap of blocks. * 2. split the extent into two extents, zeroout the first half.
* 3. split the extent into two extents, zeroout the second half.
* 4. split the extent into two extents without zeroout.
*/ */
if (!ex1 && allocated > map->m_len) split_map.m_lblk = map->m_lblk;
ex2->ee_len = cpu_to_le16(map->m_len); split_map.m_len = map->m_len;
/* ex3: to ee_block + ee_len : uninitialised */
if (allocated > map->m_len) {
unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
/*
* map->m_lblk == ee_block is handled by the zeroout
* at the beginning.
* Mark first half uninitialized.
* Mark second half initialized and zero out the
* initialized extent
*/
ex->ee_block = orig_ex.ee_block;
ex->ee_len = cpu_to_le16(ee_len - allocated);
ext4_ext_mark_uninitialized(ex);
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
ex3 = &newex;
ex3->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
err = ext4_ext_insert_extent(handle, inode, path,
ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex,
ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* blocks available from map->m_lblk */
return allocated;
} else if (err)
goto fix_extent_len;
/* if (allocated > map->m_len) {
* We need to zero out the second half because if (allocated <= EXT4_EXT_ZERO_LEN &&
* a fallocate request can update file size and (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
* converting the second half to initialized extent /* case 3 */
* implies that we can leak some junk data to user zero_ex.ee_block =
* space. cpu_to_le32(map->m_lblk);
*/ zero_ex.ee_len = cpu_to_le16(allocated);
err = ext4_ext_zeroout(inode, ex3); ext4_ext_store_pblock(&zero_ex,
if (err) { ext4_ext_pblock(ex) + map->m_lblk - ee_block);
/* err = ext4_ext_zeroout(inode, &zero_ex);
* We should actually mark the if (err)
* second half as uninit and return error goto out;
* Insert would have changed the extent split_map.m_lblk = map->m_lblk;
*/ split_map.m_len = allocated;
depth = ext_depth(inode); } else if ((map->m_lblk - ee_block + map->m_len <
ext4_ext_drop_refs(path); EXT4_EXT_ZERO_LEN) &&
path = ext4_ext_find_extent(inode, map->m_lblk, (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
path); /* case 2 */
if (IS_ERR(path)) { if (map->m_lblk != ee_block) {
err = PTR_ERR(path); zero_ex.ee_block = ex->ee_block;
return err; zero_ex.ee_len = cpu_to_le16(map->m_lblk -
} ee_block);
/* get the second half extent details */ ext4_ext_store_pblock(&zero_ex,
ex = path[depth].p_ext; ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode, err = ext4_ext_zeroout(inode, &zero_ex);
path + depth);
if (err) if (err)
return err; goto out;
ext4_ext_mark_uninitialized(ex);
ext4_ext_dirty(handle, inode, path + depth);
return err;
} }
/* zeroed the second half */ split_map.m_lblk = ee_block;
return allocated; split_map.m_len = map->m_lblk - ee_block + map->m_len;
} allocated = map->m_len;
ex3 = &newex;
ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
ext4_ext_store_pblock(ex3, newblock + map->m_len);
ex3->ee_len = cpu_to_le16(allocated - map->m_len);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
/* blocks available from map->m_lblk */
return allocated;
} else if (err)
goto fix_extent_len;
/*
* The depth, and hence eh & ex might change
* as part of the insert above.
*/
newdepth = ext_depth(inode);
/*
* update the extent length after successful insert of the
* split extent
*/
ee_len -= ext4_ext_get_actual_len(ex3);
orig_ex.ee_len = cpu_to_le16(ee_len);
may_zeroout = ee_block + ee_len <= eof_block;
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
} }
eh = path[depth].p_hdr; }
ex = path[depth].p_ext;
if (ex2 != &newex)
ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
allocated = map->m_len; allocated = ext4_split_extent(handle, inode, path,
&split_map, split_flag, 0);
if (allocated < 0)
err = allocated;
/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
* to insert an extent in the middle, zero out directly
* otherwise give the extent a chance to merge to the left
*/
if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
map->m_lblk != ee_block && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
/* blocks available from map->m_lblk */
return allocated;
}
}
/*
* If there was a change of depth as part of the
* insertion of ex3 above, we need to update the length
* of the ex1 extent again here
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
ex2->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
if (ex2 != ex)
goto insert;
/*
* New (initialized) extent starts from the first block
* in the current extent. i.e., ex2 == ex
* We have to see if it can be merged with the extent
* on the left.
*/
if (ex2 > EXT_FIRST_EXTENT(eh)) {
/*
* To merge left, pass "ex2 - 1" to try_to_merge(),
* since it merges towards right _only_.
*/
ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
if (ret) {
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto out;
depth = ext_depth(inode);
ex2--;
}
}
/*
* Try to Merge towards right. This might be required
* only when the whole extent is being written to.
* i.e. ex2 == ex and ex3 == NULL.
*/
if (!ex3) {
ret = ext4_ext_try_to_merge(inode, path, ex2);
if (ret) {
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto out;
}
}
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
return allocated;
} else if (err)
goto fix_extent_len;
out: out:
ext4_ext_show_leaf(inode, path);
return err ? err : allocated; return err ? err : allocated;
fix_extent_len:
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_mark_uninitialized(ex);
ext4_ext_dirty(handle, inode, path + depth);
return err;
} }
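The branch structure above condenses to the following decision ladder (our condensation, illustrative only; every zeroout shortcut also requires the MAY_ZEROOUT bit, i.e. the extent must end inside i_size or new_size):

	if (ee_len <= 2 * EXT4_EXT_ZERO_LEN)
		/* tiny extent: zero the whole thing, mark it initialized */
	else if (allocated > map->m_len) {
		if (allocated <= EXT4_EXT_ZERO_LEN)
			/* case 3: zero [m_lblk, extent end), single split */
		else if (map->m_lblk - ee_block + map->m_len < EXT4_EXT_ZERO_LEN)
			/* case 2: zero [ee_block, m_lblk), widen the split
			 * range left to ee_block */
	}
	/* whatever remains (cases 1 and 4) is handed to ext4_split_extent(),
	 * which performs the two- or three-way split without zeroout */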
/* /*
...@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_ext_path *path, struct ext4_ext_path *path,
int flags) int flags)
{ {
struct ext4_extent *ex, newex, orig_ex; ext4_lblk_t eof_block;
struct ext4_extent *ex1 = NULL; ext4_lblk_t ee_block;
struct ext4_extent *ex2 = NULL; struct ext4_extent *ex;
struct ext4_extent *ex3 = NULL; unsigned int ee_len;
ext4_lblk_t ee_block, eof_block; int split_flag = 0, depth;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
int may_zeroout;
ext_debug("ext4_split_unwritten_extents: inode %lu, logical" ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino, "block %llu, max_blocks %u\n", inode->i_ino,
...@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
inode->i_sb->s_blocksize_bits; inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len) if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len; eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
/* /*
* It is safe to convert extent to initialized via explicit * It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size. * zeroout only if extent is fully inside i_size or new_size.
*/ */
may_zeroout = ee_block + ee_len <= eof_block; depth = ext_depth(inode);
ex = path[depth].p_ext;
/* ee_block = le32_to_cpu(ex->ee_block);
* If the uninitialized extent begins at the same logical ee_len = ext4_ext_get_actual_len(ex);
* block where the write begins, and the write completely
* covers the extent, then we don't need to split it.
*/
if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
return allocated;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
if (map->m_lblk > ee_block) {
ex1 = ex;
ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/*
* for sanity, update the length of the ex2 extent before
* we insert ex3, if ex1 is NULL. This is to avoid temporary
* overlap of blocks.
*/
if (!ex1 && allocated > map->m_len)
ex2->ee_len = cpu_to_le16(map->m_len);
/* ex3: to ee_block + ee_len : uninitialised */
if (allocated > map->m_len) {
unsigned int newdepth;
ex3 = &newex;
ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
ext4_ext_store_pblock(ex3, newblock + map->m_len);
ex3->ee_len = cpu_to_le16(allocated - map->m_len);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
/* blocks available from map->m_lblk */
return allocated;
} else if (err)
goto fix_extent_len;
/*
* The depth, and hence eh & ex might change
* as part of the insert above.
*/
newdepth = ext_depth(inode);
/*
* update the extent length after successful insert of the
* split extent
*/
ee_len -= ext4_ext_get_actual_len(ex3);
orig_ex.ee_len = cpu_to_le16(ee_len);
may_zeroout = ee_block + ee_len <= eof_block;
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
}
ex = path[depth].p_ext;
if (ex2 != &newex)
ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
allocated = map->m_len; split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
} split_flag |= EXT4_EXT_MARK_UNINIT2;
/*
* If there was a change of depth as part of the
* insertion of ex3 above, we need to update the length
* of the ex1 extent again here
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/*
* ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
* using direct I/O, uninitialised still.
*/
ex2->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
ext4_ext_mark_uninitialized(ex2);
if (ex2 != ex)
goto insert;
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
ext_debug("out here\n");
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
return allocated;
} else if (err)
goto fix_extent_len;
out:
ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len: flags |= EXT4_GET_BLOCKS_PRE_IO;
ex->ee_block = orig_ex.ee_block; return ext4_split_extent(handle, inode, path, map, split_flag, flags);
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
ext4_ext_mark_uninitialized(ex);
ext4_ext_dirty(handle, inode, path + depth);
return err;
} }
static int ext4_convert_unwritten_extents_endio(handle_t *handle, static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode, struct inode *inode,
struct ext4_ext_path *path) struct ext4_ext_path *path)
...@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct ext4_extent_header *eh; struct ext4_extent_header *eh;
int depth; int depth;
int err = 0; int err = 0;
int ret = 0;
depth = ext_depth(inode); depth = ext_depth(inode);
eh = path[depth].p_hdr; eh = path[depth].p_hdr;
ex = path[depth].p_ext; ex = path[depth].p_ext;
ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
err = ext4_ext_get_access(handle, inode, path + depth); err = ext4_ext_get_access(handle, inode, path + depth);
if (err) if (err)
goto out; goto out;
/* first mark the extent as initialized */ /* first mark the extent as initialized */
ext4_ext_mark_initialized(ex); ext4_ext_mark_initialized(ex);
/* /* note: ext4_ext_correct_indexes() isn't needed here because
* We have to see if it can be merged with the extent * borders are not changed
* on the left.
*/
if (ex > EXT_FIRST_EXTENT(eh)) {
/*
* To merge left, pass "ex - 1" to try_to_merge(),
* since it merges towards right _only_.
*/
ret = ext4_ext_try_to_merge(inode, path, ex - 1);
if (ret) {
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto out;
depth = ext_depth(inode);
ex--;
}
}
/*
* Try to Merge towards right.
*/ */
ret = ext4_ext_try_to_merge(inode, path, ex); ext4_ext_try_to_merge(inode, path, ex);
if (ret) {
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto out;
depth = ext_depth(inode);
}
/* Mark modified extent as dirty */ /* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth); err = ext4_ext_dirty(handle, inode, path + depth);
out: out:
...@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock = 0; ext4_fsblk_t newblock = 0;
int err = 0, depth, ret; int err = 0, depth, ret;
unsigned int allocated = 0; unsigned int allocated = 0;
unsigned int punched_out = 0;
unsigned int result = 0;
struct ext4_allocation_request ar; struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
struct ext4_map_blocks punch_map;
ext_debug("blocks %u/%u requested for inode %lu\n", ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino); map->m_lblk, map->m_len, inode->i_ino);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* check in cache */ /* check in cache */
if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) { if (!newex.ee_start_lo && !newex.ee_start_hi) {
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/* /*
...@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock); ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */ if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
if (!ext4_ext_is_uninitialized(ex)) { /*
ext4_ext_put_in_cache(inode, ee_block, * Do not put uninitialized extent
ee_len, ee_start); * in the cache
goto out; */
if (!ext4_ext_is_uninitialized(ex)) {
ext4_ext_put_in_cache(inode, ee_block,
ee_len, ee_start);
goto out;
}
ret = ext4_ext_handle_uninitialized_extents(
handle, inode, map, path, flags,
allocated, newblock);
return ret;
} }
ret = ext4_ext_handle_uninitialized_extents(handle,
inode, map, path, flags, allocated, /*
newblock); * Punch out the map length, but only to the
return ret; * end of the extent
*/
punched_out = allocated < map->m_len ?
allocated : map->m_len;
/*
* Since extents need to be converted to
* uninitialized, they must fit in an
* uninitialized extent
*/
if (punched_out > EXT_UNINIT_MAX_LEN)
punched_out = EXT_UNINIT_MAX_LEN;
punch_map.m_lblk = map->m_lblk;
punch_map.m_pblk = newblock;
punch_map.m_len = punched_out;
punch_map.m_flags = 0;
/* Check to see if the extent needs to be split */
if (punch_map.m_len != ee_len ||
punch_map.m_lblk != ee_block) {
ret = ext4_split_extent(handle, inode,
path, &punch_map, 0,
EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
EXT4_GET_BLOCKS_PRE_IO);
if (ret < 0) {
err = ret;
goto out2;
}
/*
* find extent for the block at
* the start of the hole
*/
ext4_ext_drop_refs(path);
kfree(path);
path = ext4_ext_find_extent(inode,
map->m_lblk, NULL);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
goto out2;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_len = ext4_ext_get_actual_len(ex);
ee_block = le32_to_cpu(ex->ee_block);
ee_start = ext4_ext_pblock(ex);
}
ext4_ext_mark_uninitialized(ex);
err = ext4_ext_remove_space(inode, map->m_lblk,
map->m_lblk + punched_out);
goto out2;
} }
} }
...@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
else else
/* disable in-core preallocation for non-regular files */ /* disable in-core preallocation for non-regular files */
ar.flags = 0; ar.flags = 0;
if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
ar.flags |= EXT4_MB_HINT_NOPREALLOC;
newblock = ext4_mb_new_blocks(handle, &ar, &err); newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock) if (!newblock)
goto out2; goto out2;
...@@ -3529,7 +3647,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
} }
trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
newblock, map->m_len, err ? err : allocated); newblock, map->m_len, err ? err : allocated);
return err ? err : allocated;
result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
punched_out : allocated;
return err ? err : result;
} }
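The new punch-out path through ext4_ext_map_blocks() is easiest to follow with concrete numbers (ours, purely illustrative). Suppose the found extent covers [100, 150) and the caller punches map = {m_lblk = 120, m_len = 10}:

	allocated   = ee_len - (m_lblk - ee_block) = 50 - 20 = 30
	punched_out = min(allocated, m_len)        = 10
	split:        [100,120) [120,130) [130,150)  /* via ext4_split_extent() */
	then:         mark [120,130) uninitialized,
	              ext4_ext_remove_space(inode, 120, 130)
	return punched_out (10), not allocated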
void ext4_ext_truncate(struct inode *inode) void ext4_ext_truncate(struct inode *inode)
...@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1) last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb); >> EXT4_BLOCK_SIZE_BITS(sb);
err = ext4_ext_remove_space(inode, last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
/* In a multi-transaction truncate, we only make the final /* In a multi-transaction truncate, we only make the final
* transaction synchronous. * transaction synchronous.
...@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
if (IS_SYNC(inode)) if (IS_SYNC(inode))
ext4_handle_sync(handle); ext4_handle_sync(handle);
out_stop:
up_write(&EXT4_I(inode)->i_data_sem); up_write(&EXT4_I(inode)->i_data_sem);
out_stop:
/* /*
* If this was a simple ftruncate() and the file will remain alive, * If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above. * then we need to clear up the orphan record which we created above.
...@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map; struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits; unsigned int credits, blkbits = inode->i_blkbits;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
if (mode & ~FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
/* /*
* currently supporting (pre)allocate mode for extent-based * currently supporting (pre)allocate mode for extent-based
* files _only_ * files _only_
...@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP; return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
return ext4_punch_hole(file, offset, len);
trace_ext4_fallocate_enter(inode, offset, len, mode); trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits; map.m_lblk = offset >> blkbits;
/* /*
...@@ -3691,7 +3817,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
break; break;
} }
ret = ext4_map_blocks(handle, inode, &map, ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
EXT4_GET_BLOCKS_NO_NORMALIZE);
if (ret <= 0) { if (ret <= 0) {
#ifdef EXT4FS_DEBUG #ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0); WARN_ON(ret <= 0);
...@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
pgoff_t last_offset; pgoff_t last_offset;
pgoff_t offset; pgoff_t offset;
pgoff_t index; pgoff_t index;
pgoff_t start_index = 0;
struct page **pages = NULL; struct page **pages = NULL;
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
struct buffer_head *head = NULL; struct buffer_head *head = NULL;
...@@ -3848,39 +3976,57 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
kfree(pages); kfree(pages);
return EXT_CONTINUE; return EXT_CONTINUE;
} }
index = 0;
next_page:
/* Try to find the 1st mapped buffer. */ /* Try to find the 1st mapped buffer. */
end = ((__u64)pages[0]->index << PAGE_SHIFT) >> end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
blksize_bits; blksize_bits;
if (!page_has_buffers(pages[0])) if (!page_has_buffers(pages[index]))
goto out; goto out;
head = page_buffers(pages[0]); head = page_buffers(pages[index]);
if (!head) if (!head)
goto out; goto out;
index++;
bh = head; bh = head;
do { do {
if (buffer_mapped(bh)) { if (end >= newex->ec_block +
newex->ec_len)
/* The buffer is out of
* the request range.
*/
goto out;
if (buffer_mapped(bh) &&
end >= newex->ec_block) {
start_index = index - 1;
/* get the 1st mapped buffer. */ /* get the 1st mapped buffer. */
if (end > newex->ec_block +
newex->ec_len)
/* The buffer is out of
* the request range.
*/
goto out;
goto found_mapped_buffer; goto found_mapped_buffer;
} }
bh = bh->b_this_page; bh = bh->b_this_page;
end++; end++;
} while (bh != head); } while (bh != head);
/* No mapped buffer found. */ /* No mapped buffer in the range was found in this page;
goto out; * we need to look up the next page.
*/
if (index >= ret) {
/* There is no page left, but we need to limit
* newex->ec_len.
*/
newex->ec_len = end - newex->ec_block;
goto out;
}
goto next_page;
} else { } else {
/* Find contiguous delayed buffers. */ /* Find contiguous delayed buffers. */
if (ret > 0 && pages[0]->index == last_offset) if (ret > 0 && pages[0]->index == last_offset)
head = page_buffers(pages[0]); head = page_buffers(pages[0]);
bh = head; bh = head;
index = 1;
start_index = 0;
} }
found_mapped_buffer: found_mapped_buffer:
...@@ -3903,7 +4049,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
end++; end++;
} while (bh != head); } while (bh != head);
for (index = 1; index < ret; index++) { for (; index < ret; index++) {
if (!page_has_buffers(pages[index])) { if (!page_has_buffers(pages[index])) {
bh = NULL; bh = NULL;
break; break;
...@@ -3913,8 +4059,10 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
bh = NULL; bh = NULL;
break; break;
} }
if (pages[index]->index != if (pages[index]->index !=
pages[0]->index + index) { pages[start_index]->index + index
- start_index) {
/* Blocks are not contiguous. */ /* Blocks are not contiguous. */
bh = NULL; bh = NULL;
break; break;
...@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0); return (error < 0 ? error : 0);
} }
/*
* ext4_ext_punch_hole
*
* Punches a hole of "length" bytes in a file starting
* at byte "offset"
*
* @file:	The file to punch a hole in
* @offset:	The starting byte offset of the hole
* @length:	The length of the hole
*
* Returns 0 on success or a negative error code on failure
*/
int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct super_block *sb = inode->i_sb;
struct ext4_ext_cache cache_ex;
ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
struct address_space *mapping = inode->i_mapping;
struct ext4_map_blocks map;
handle_t *handle;
loff_t first_block_offset, last_block_offset, block_len;
loff_t first_page, last_page, first_page_offset, last_page_offset;
int ret, credits, blocks_released, err = 0;
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
last_page = (offset + length) >> PAGE_CACHE_SHIFT;
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
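/*
 * Worked example (ours, not from the patch): blocksize 4096,
 * offset = 1000, length = 8192.  Then:
 *	first_block        = 1	(offset rounded up to a block boundary)
 *	last_block         = 2	((1000 + 8192) >> 12)
 *	first_block_offset = 4096, last_block_offset = 8192
 * so block 1 is punched out whole, while the byte ranges
 * [1000, 4096) and [8192, 9192) are zeroed in place below.
 */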
/*
* Write out all dirty pages to avoid race conditions,
* then release them.
*/
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
err = filemap_write_and_wait_range(mapping,
first_page_offset == 0 ? 0 : first_page_offset-1,
last_page_offset);
if (err)
return err;
}
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_inode_pages_range(mapping, first_page_offset,
last_page_offset-1);
}
/* finish any pending end_io work */
ext4_flush_completed_IO(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
err = ext4_orphan_add(handle, inode);
if (err)
goto out;
/*
* Now we need to zero out the non-block-aligned data.
* If the hole lies within a single block, just
* zero out the middle of that block
*/
if (first_block > last_block)
ext4_block_zero_page_range(handle, mapping, offset, length);
else {
/* zero out the head of the hole before the first block */
block_len = first_block_offset - offset;
if (block_len > 0)
ext4_block_zero_page_range(handle, mapping,
offset, block_len);
/* zero out the tail of the hole after the last block */
block_len = offset + length - last_block_offset;
if (block_len > 0) {
ext4_block_zero_page_range(handle, mapping,
last_block_offset, block_len);
}
}
/* If there are no blocks to remove, return now */
if (first_block >= last_block)
goto out;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
/*
* Loop over all the blocks and identify blocks
* that need to be punched out
*/
iblock = first_block;
blocks_released = 0;
while (iblock < last_block) {
max_blocks = last_block - iblock;
num_blocks = 1;
memset(&map, 0, sizeof(map));
map.m_lblk = iblock;
map.m_len = max_blocks;
ret = ext4_ext_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
if (ret > 0) {
blocks_released += ret;
num_blocks = ret;
} else if (ret == 0) {
/*
* If map blocks could not find the block,
* then it is in a hole. If the hole was
* not already cached, then map blocks should
* put it in the cache. So we can get the hole
* out of the cache
*/
memset(&cache_ex, 0, sizeof(cache_ex));
if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
!cache_ex.ec_start) {
/* The hole is cached */
num_blocks = cache_ex.ec_block +
cache_ex.ec_len - iblock;
} else {
/* The block could not be identified */
err = -EIO;
break;
}
} else {
/* Map blocks error */
err = ret;
break;
}
if (num_blocks == 0) {
/* This condition should never happen */
ext_debug("Block lookup failed");
err = -EIO;
break;
}
iblock += num_blocks;
}
if (blocks_released > 0) {
ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
}
if (IS_SYNC(inode))
ext4_handle_sync(handle);
up_write(&EXT4_I(inode)->i_data_sem);
out:
ext4_orphan_del(handle, inode);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len) __u64 start, __u64 len)
{ {
...@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return error; return error;
} }
...@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
}; };
const struct inode_operations ext4_file_inode_operations = { const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr, .setattr = ext4_setattr,
.getattr = ext4_getattr, .getattr = ext4_getattr,
#ifdef CONFIG_EXT4_FS_XATTR #ifdef CONFIG_EXT4_FS_XATTR
......
...@@ -36,7 +36,7 @@
static void dump_completed_IO(struct inode * inode) static void dump_completed_IO(struct inode * inode)
{ {
#ifdef EXT4_DEBUG #ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after; struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1; ext4_io_end_t *io, *io0, *io1;
unsigned long flags; unsigned long flags;
...@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret; int ret;
tid_t commit_tid; tid_t commit_tid;
bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL); J_ASSERT(ext4_journal_current_handle() == NULL);
...@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
} }
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
if (jbd2_log_start_commit(journal, commit_tid)) { if (journal->j_flags & JBD2_BARRIER &&
/* !jbd2_trans_will_send_data_barrier(journal, commit_tid))
* When the journal is on a different device than the needs_barrier = true;
* fs data disk, we need to issue the barrier in jbd2_log_start_commit(journal, commit_tid);
* writeback mode. (In ordered mode, the jbd2 layer ret = jbd2_log_wait_commit(journal, commit_tid);
* will take care of issuing the barrier. In if (needs_barrier)
* data=journal, all of the data blocks are written to
* the journal device.)
*/
if (ext4_should_writeback_data(inode) &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
NULL);
ret = jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
out: out:
trace_ext4_sync_file_exit(inode, ret); trace_ext4_sync_file_exit(inode, ret);
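The rewritten barrier logic is compact but subtle; restated (our condensation of the hunk above, not new code):

	needs_barrier = (journal->j_flags & JBD2_BARRIER) &&
			!jbd2_trans_will_send_data_barrier(journal, commit_tid);
	jbd2_log_start_commit(journal, commit_tid);
	ret = jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);

The explicit flush is now issued only when barriers are enabled and the committing transaction will not itself flush the data device, replacing the old writeback-mode/journal-device special cases.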
......
...@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
while (target > 0) { while (target > 0) {
count = target; count = target;
/* allocating blocks for indirect blocks and direct blocks */ /* allocating blocks for indirect blocks and direct blocks */
current_block = ext4_new_meta_blocks(handle, inode, current_block = ext4_new_meta_blocks(handle, inode, goal,
goal, &count, err); 0, &count, err);
if (*err) if (*err)
goto failed_out; goto failed_out;
...@@ -1930,7 +1930,7 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
* We do still charge estimated metadata to the sb though; * We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks. * we cannot afford to run out of free blocks.
*/ */
if (ext4_claim_free_blocks(sbi, md_needed + 1)) { if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
dquot_release_reservation_block(inode, 1); dquot_release_reservation_block(inode, 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) { if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield(); yield();
...@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
continue; continue;
} }
if (PageWriteback(page)) wait_on_page_writeback(page);
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page)); BUG_ON(PageWriteback(page));
if (mpd->next_page != page->index) if (mpd->next_page != page->index)
...@@ -3513,7 +3511,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs); loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize) if (end > isize)
vmtruncate(inode, isize); ext4_truncate_failed_write(inode);
} }
} }
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
...@@ -3915,10 +3913,31 @@ void ext4_set_aops(struct inode *inode)
*/ */
int ext4_block_truncate_page(handle_t *handle, int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from) struct address_space *mapping, loff_t from)
{
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
return ext4_block_zero_page_range(handle, mapping, from, length);
}
/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zeroed must
* be contained within one block. If the specified range exceeds
* the end of the block, it will be shortened to the end of the block
* that corresponds to 'from'
*/
int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{ {
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize, length, pos; unsigned blocksize, max, pos;
ext4_lblk_t iblock; ext4_lblk_t iblock;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
struct buffer_head *bh; struct buffer_head *bh;
...@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
return -EINVAL; return -EINVAL;
blocksize = inode->i_sb->s_blocksize; blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1)); max = blocksize - (offset & (blocksize - 1));
/*
* correct length if it does not fall between
* 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;
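/*
 * Illustrative clamp (our numbers): blocksize 4096, from = 4094,
 * length = 100.  offset-in-block = 4094, so max = 2 and length is
 * clamped to 2: only bytes 4094..4095 of the block are zeroed.
 */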
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page)) if (!page_has_buffers(page))
...@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
int ext4_can_truncate(struct inode *inode) int ext4_can_truncate(struct inode *inode)
{ {
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return 0;
if (S_ISREG(inode->i_mode)) if (S_ISREG(inode->i_mode))
return 1; return 1;
if (S_ISDIR(inode->i_mode)) if (S_ISDIR(inode->i_mode))
...@@ -4391,6 +4416,31 @@ int ext4_can_truncate(struct inode *inode)
return 0; return 0;
} }
/*
* ext4_punch_hole: punches a hole in a file by releasing the blocks
* associated with the given offset and length
*
* @file:	The file to punch a hole in
* @offset:	The offset where the hole will begin
* @len:	The length of the hole
*
* Returns: 0 on success or negative on failure
*/
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file->f_path.dentry->d_inode;
if (!S_ISREG(inode->i_mode))
return -ENOTSUPP;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
/* TODO: Add support for non extent hole punching */
return -ENOTSUPP;
}
return ext4_ext_punch_hole(file, offset, length);
}
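With ext4_punch_hole() wired into ext4_fallocate() above, the feature is reachable from userspace via fallocate(2). A minimal test program (ours; needs a kernel with this series and an extent-based ext4 file; the VFS requires KEEP_SIZE together with PUNCH_HOLE):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <linux/falloc.h>	/* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */

	int main(int argc, char **argv)
	{
		int fd;

		if (argc < 2) {
			fprintf(stderr, "usage: %s <file>\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Punch a 1 MiB hole at offset 4 MiB. */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      4 << 20, 1 << 20) < 0)
			perror("fallocate");
		close(fd);
		return 0;
	}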
/* /*
* ext4_truncate() * ext4_truncate()
* *
...@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/* /*
* Figure out the offset within the block group inode table * Figure out the offset within the block group inode table
*/ */
inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) % inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb)); EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
...@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) && if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE && attr->ia_valid & ATTR_SIZE &&
(attr->ia_size < inode->i_size || (attr->ia_size < inode->i_size)) {
(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
handle_t *handle; handle_t *handle;
handle = ext4_journal_start(inode, 3); handle = ext4_journal_start(inode, 3);
...@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out; goto err_out;
} }
} }
/* ext4_truncate will clear the flag */
if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
ext4_truncate(inode);
} }
if ((attr->ia_valid & ATTR_SIZE) && if (attr->ia_valid & ATTR_SIZE) {
attr->ia_size != i_size_read(inode)) if (attr->ia_size != i_size_read(inode)) {
rc = vmtruncate(inode, attr->ia_size); truncate_setsize(inode, attr->ia_size);
ext4_truncate(inode);
} else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
ext4_truncate(inode);
}
if (!rc) { if (!rc) {
setattr_copy(inode, attr); setattr_copy(inode, attr);
...@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out_unlock; goto out_unlock;
} }
ret = 0; ret = 0;
if (PageMappedToDisk(page))
goto out_unlock; lock_page(page);
wait_on_page_writeback(page);
if (PageMappedToDisk(page)) {
up_read(&inode->i_alloc_sem);
return VM_FAULT_LOCKED;
}
if (page->index == size >> PAGE_CACHE_SHIFT) if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK; len = size & ~PAGE_CACHE_MASK;
else else
len = PAGE_CACHE_SIZE; len = PAGE_CACHE_SIZE;
lock_page(page);
/* /*
* return if we have all the buffers mapped. This avoids * return if we have all the buffers mapped. This avoids
* the need to call write_begin/write_end which does a * the need to call write_begin/write_end which does a
...@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page_has_buffers(page)) { if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
ext4_bh_unmapped)) { ext4_bh_unmapped)) {
unlock_page(page); up_read(&inode->i_alloc_sem);
goto out_unlock; return VM_FAULT_LOCKED;
} }
} }
unlock_page(page); unlock_page(page);
...@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret < 0) if (ret < 0)
goto out_unlock; goto out_unlock;
ret = 0; ret = 0;
/*
* write_begin/end might have created a dirty page and someone
* could wander in and start the IO. Make sure that hasn't
* happened.
*/
lock_page(page);
wait_on_page_writeback(page);
up_read(&inode->i_alloc_sem);
return VM_FAULT_LOCKED;
out_unlock: out_unlock:
if (ret) if (ret)
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
......
...@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
struct inode *inode; struct inode *inode;
char *data; char *data;
char *bitmap; char *bitmap;
struct ext4_group_info *grinfo;
mb_debug(1, "init page %lu\n", page->index); mb_debug(1, "init page %lu\n", page->index);
...@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (first_group + i >= ngroups) if (first_group + i >= ngroups)
break; break;
grinfo = ext4_get_group_info(sb, first_group + i);
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
* we must skip all initialized uptodate buddies on the page,
* which may be currently in use by an allocating task.
*/
if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
bh[i] = NULL;
continue;
}
err = -EIO; err = -EIO;
desc = ext4_get_group_desc(sb, first_group + i, NULL); desc = ext4_get_group_desc(sb, first_group + i, NULL);
if (desc == NULL) if (desc == NULL)
...@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
} }
/* wait for I/O completion */ /* wait for I/O completion */
for (i = 0; i < groups_per_page && bh[i]; i++) for (i = 0; i < groups_per_page; i++)
wait_on_buffer(bh[i]); if (bh[i])
wait_on_buffer(bh[i]);
err = -EIO; err = -EIO;
for (i = 0; i < groups_per_page && bh[i]; i++) for (i = 0; i < groups_per_page; i++)
if (!buffer_uptodate(bh[i])) if (bh[i] && !buffer_uptodate(bh[i]))
goto out; goto out;
err = 0; err = 0;
first_block = page->index * blocks_per_page; first_block = page->index * blocks_per_page;
/* init the page */
memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) { for (i = 0; i < blocks_per_page; i++) {
int group; int group;
struct ext4_group_info *grinfo;
group = (first_block + i) >> 1; group = (first_block + i) >> 1;
if (group >= ngroups) if (group >= ngroups)
break; break;
if (!bh[group - first_group])
/* skip initialized uptodate buddy */
continue;
/* /*
* data carries information regarding this * data carries information regarding this
* particular group in the format specified * particular group in the format specified
...@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
* incore got set to the group block bitmap below * incore got set to the group block bitmap below
*/ */
ext4_lock_group(sb, group); ext4_lock_group(sb, group);
/* init the buddy */
memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group); ext4_mb_generate_buddy(sb, data, incore, group);
ext4_unlock_group(sb, group); ext4_unlock_group(sb, group);
incore = NULL; incore = NULL;
...@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
out: out:
if (bh) { if (bh) {
for (i = 0; i < groups_per_page && bh[i]; i++) for (i = 0; i < groups_per_page; i++)
brelse(bh[i]); brelse(bh[i]);
if (bh != &bhs) if (bh != &bhs)
kfree(bh); kfree(bh);
...@@ -957,22 +974,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
} }
/* /*
* lock the group_info alloc_sem of all the groups * Lock the buddy and bitmap pages. This makes sure other parallel init_group
* belonging to the same buddy cache page. This * on the same buddy page doesn't happen while holding the buddy page lock.
* makes sure other parallel operation on the buddy * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
* cache doesn't happen while holding the buddy cache * are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
* lock
*/ */
static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
ext4_group_t group) ext4_group_t group, struct ext4_buddy *e4b)
{ {
int i; struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
int block, pnum; int block, pnum, poff;
int blocks_per_page; int blocks_per_page;
int groups_per_page; struct page *page;
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t first_group; e4b->bd_buddy_page = NULL;
struct ext4_group_info *grp; e4b->bd_bitmap_page = NULL;
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
/* /*
...@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
*/ */
block = group * 2; block = group * 2;
pnum = block / blocks_per_page; pnum = block / blocks_per_page;
first_group = pnum * blocks_per_page / 2; poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
groups_per_page = blocks_per_page >> 1; if (!page)
if (groups_per_page == 0) return -EIO;
groups_per_page = 1; BUG_ON(page->mapping != inode->i_mapping);
/* read all groups the page covers into the cache */ e4b->bd_bitmap_page = page;
for (i = 0; i < groups_per_page; i++) { e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
if ((first_group + i) >= ngroups) if (blocks_per_page >= 2) {
break; /* buddy and bitmap are on the same page */
grp = ext4_get_group_info(sb, first_group + i); return 0;
/* take all groups write allocation
* semaphore. This make sure there is
* no block allocation going on in any
* of that groups
*/
down_write_nested(&grp->alloc_sem, i);
} }
return i;
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
return -EIO;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_buddy_page = page;
return 0;
} }
static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
ext4_group_t group, int locked_group)
{ {
int i; if (e4b->bd_bitmap_page) {
int block, pnum; unlock_page(e4b->bd_bitmap_page);
int blocks_per_page; page_cache_release(e4b->bd_bitmap_page);
ext4_group_t first_group; }
struct ext4_group_info *grp; if (e4b->bd_buddy_page) {
unlock_page(e4b->bd_buddy_page);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; page_cache_release(e4b->bd_buddy_page);
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
* So for each group we need two blocks.
*/
block = group * 2;
pnum = block / blocks_per_page;
first_group = pnum * blocks_per_page / 2;
/* release locks on all the groups */
for (i = 0; i < locked_group; i++) {
grp = ext4_get_group_info(sb, first_group + i);
/* take all groups write allocation
* semaphore. This makes sure there is
* no block allocation going on in any
* of those groups
*/
up_write(&grp->alloc_sem);
} }
} }
/* /*
...@@ -1044,93 +1043,60 @@ static noinline_for_stack ...@@ -1044,93 +1043,60 @@ static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{ {
int ret = 0;
void *bitmap;
int blocks_per_page;
int block, pnum, poff;
int num_grp_locked = 0;
struct ext4_group_info *this_grp; struct ext4_group_info *this_grp;
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b;
struct inode *inode = sbi->s_buddy_cache; struct page *page;
struct page *page = NULL, *bitmap_page = NULL; int ret = 0;
mb_debug(1, "init group %u\n", group); mb_debug(1, "init group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
this_grp = ext4_get_group_info(sb, group); this_grp = ext4_get_group_info(sb, group);
/* /*
* This ensures that we don't reinit the buddy cache * This ensures that we don't reinit the buddy cache
* page which maps to the group from which we are already * page which maps to the group from which we are already
* allocating. If we are looking at the buddy cache we would * allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that * have taken a reference using ext4_mb_load_buddy and that
* would have taken the alloc_sem lock. * would have pinned the buddy page in the page cache.
*/ */
num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/* /*
* somebody initialized the group * somebody initialized the group
* return without doing anything * return without doing anything
*/ */
ret = 0;
goto err; goto err;
} }
/*
* the buddy cache inode stores the block bitmap page = e4b.bd_bitmap_page;
* and buddy information in consecutive blocks. ret = ext4_mb_init_cache(page, NULL);
* So for each group we need two blocks. if (ret)
*/ goto err;
block = group * 2; if (!PageUptodate(page)) {
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
ret = ext4_mb_init_cache(page, NULL);
if (ret) {
unlock_page(page);
goto err;
}
unlock_page(page);
}
if (page == NULL || !PageUptodate(page)) {
ret = -EIO; ret = -EIO;
goto err; goto err;
} }
mark_page_accessed(page); mark_page_accessed(page);
bitmap_page = page;
bitmap = page_address(page) + (poff * sb->s_blocksize);
/* init buddy cache */ if (e4b.bd_buddy_page == NULL) {
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page == bitmap_page) {
/* /*
* If both the bitmap and buddy are in * If both the bitmap and buddy are in
* the same page we don't need to force * the same page we don't need to force
* init the buddy * init the buddy
*/ */
unlock_page(page); ret = 0;
} else if (page) { goto err;
BUG_ON(page->mapping != inode->i_mapping);
ret = ext4_mb_init_cache(page, bitmap);
if (ret) {
unlock_page(page);
goto err;
}
unlock_page(page);
} }
if (page == NULL || !PageUptodate(page)) { /* init buddy cache */
page = e4b.bd_buddy_page;
ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
if (ret)
goto err;
if (!PageUptodate(page)) {
ret = -EIO; ret = -EIO;
goto err; goto err;
} }
mark_page_accessed(page); mark_page_accessed(page);
err: err:
ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); ext4_mb_put_buddy_page_lock(&e4b);
if (bitmap_page)
page_cache_release(bitmap_page);
if (page)
page_cache_release(page);
return ret; return ret;
} }
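The rewritten init path is a double-checked initialization: EXT4_MB_GRP_NEED_INIT is re-tested only after the buddy/bitmap pages are locked, so a racing initializer cannot slip in between the check and the work. A minimal userspace analogue of that pattern, with a pthread mutex standing in for the locked page (all names hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static bool need_init = true;

/* analogue of ext4_mb_init_group(): re-check the flag under the lock */
static void init_group_once(void)
{
	pthread_mutex_lock(&page_lock);	/* ext4_mb_get_buddy_page_lock() */
	if (!need_init) {
		/* somebody initialized the group; return without doing anything */
		pthread_mutex_unlock(&page_lock);
		return;
	}
	puts("initializing buddy cache");	/* ext4_mb_init_cache() */
	need_init = false;
	pthread_mutex_unlock(&page_lock);	/* ext4_mb_put_buddy_page_lock() */
}

static void *worker(void *arg)
{
	(void)arg;
	init_group_once();
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;	/* "initializing buddy cache" prints exactly once */
}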
...@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ...@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
e4b->bd_group = group; e4b->bd_group = group;
e4b->bd_buddy_page = NULL; e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL; e4b->bd_bitmap_page = NULL;
e4b->alloc_semp = &grp->alloc_sem;
/* Take the read lock on the group alloc
* sem. This would make sure a parallel
* ext4_mb_init_group happening on other
* groups mapped by the page is blocked
* till we are done with allocation
*/
repeat_load_buddy:
down_read(e4b->alloc_semp);
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
/* we need to check for group need init flag
* with alloc_semp held so that we can be sure
* that new blocks didn't get added to the group
* when we are loading the buddy cache
*/
up_read(e4b->alloc_semp);
/* /*
* we need full data about the group * we need full data about the group
* to make a good selection * to make a good selection
...@@ -1189,7 +1139,6 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ...@@ -1189,7 +1139,6 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
ret = ext4_mb_init_group(sb, group); ret = ext4_mb_init_group(sb, group);
if (ret) if (ret)
return ret; return ret;
goto repeat_load_buddy;
} }
/* /*
...@@ -1273,15 +1222,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ...@@ -1273,15 +1222,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
return 0; return 0;
err: err:
if (page)
page_cache_release(page);
if (e4b->bd_bitmap_page) if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page); page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page) if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page); page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL; e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL; e4b->bd_bitmap = NULL;
/* Done with the buddy cache */
up_read(e4b->alloc_semp);
return ret; return ret;
} }
...@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) ...@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page); page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page) if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page); page_cache_release(e4b->bd_buddy_page);
/* Done with the buddy cache */
if (e4b->alloc_semp)
up_read(e4b->alloc_semp);
} }
...@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ...@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
get_page(ac->ac_bitmap_page); get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page; ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page); get_page(ac->ac_buddy_page);
/* on allocation we use ac to track the held semaphore */
ac->alloc_semp = e4b->alloc_semp;
e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */ /* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock); spin_lock(&sbi->s_md_lock);
...@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ...@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private; struct super_block *sb = journal->j_private;
struct ext4_buddy e4b; struct ext4_buddy e4b;
struct ext4_group_info *db; struct ext4_group_info *db;
int err, ret, count = 0, count2 = 0; int err, count = 0, count2 = 0;
struct ext4_free_data *entry; struct ext4_free_data *entry;
struct list_head *l, *ltmp; struct list_head *l, *ltmp;
...@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ...@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug(1, "gonna free %u blocks in group %u (0x%p):", mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry); entry->count, entry->group, entry);
if (test_opt(sb, DISCARD)) { if (test_opt(sb, DISCARD))
ret = ext4_issue_discard(sb, entry->group, ext4_issue_discard(sb, entry->group,
entry->start_blk, entry->count); entry->start_blk, entry->count);
if (unlikely(ret == -EOPNOTSUPP)) {
ext4_warning(sb, "discard not supported, "
"disabling");
clear_opt(sb, DISCARD);
}
}
err = ext4_mb_load_buddy(sb, entry->group, &e4b); err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */ /* we expect to find existing buddy because it's pinned */
...@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) ...@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
spin_unlock(&pa->pa_lock); spin_unlock(&pa->pa_lock);
} }
} }
if (ac->alloc_semp)
up_read(ac->alloc_semp);
if (pa) { if (pa) {
/* /*
* We want to add the pa to the right bucket. * We want to add the pa to the right bucket.
* Remove it from the list and while adding * Remove it from the list and while adding
* make sure the list to which we are adding * make sure the list to which we are adding
* doesn't grow big. We need to release * doesn't grow big.
* alloc_semp before calling ext4_mb_add_n_trim()
*/ */
if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock); spin_lock(pa->pa_obj_lock);
...@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ...@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
* there is enough free blocks to do block allocation * there is enough free blocks to do block allocation
* and verify allocation doesn't exceed the quota limits. * and verify allocation doesn't exceed the quota limits.
*/ */
while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { while (ar->len &&
ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
/* let others to free the space */ /* let others to free the space */
yield(); yield();
ar->len = ar->len >> 1; ar->len = ar->len >> 1;
...@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ...@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
return 0; return 0;
} }
reserv_blks = ar->len; reserv_blks = ar->len;
while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC; dquot_alloc_block_nofail(ar->inode, ar->len);
ar->len--; } else {
while (ar->len &&
dquot_alloc_block(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
}
} }
inquota = ar->len; inquota = ar->len;
if (ar->len == 0) { if (ar->len == 0) {
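Both retry loops above shrink the request rather than fail it outright: the free-blocks check halves ar->len, and the quota loop trims one block at a time (unless EXT4_MB_USE_ROOT_BLOCKS forces the charge through). A compressed userspace sketch of that back-off shape, with claim_free() and charge_quota() as hypothetical stand-ins:

#include <stdio.h>

static unsigned long free_blocks = 100;
static unsigned long quota_left  = 10;

static int claim_free(unsigned long len)   { return len <= free_blocks ? 0 : -1; }
static int charge_quota(unsigned long len) { return len <= quota_left  ? 0 : -1; }

int main(void)
{
	unsigned long len = 512;

	/* halve the request until it fits the free-block budget */
	while (len && claim_free(len))
		len >>= 1;

	/* then trim one block at a time until quota accepts it */
	while (len && charge_quota(len))
		len--;

	printf("allocated %lu blocks\n", len);	/* 10 here */
	return 0;
}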
...@@ -4703,6 +4644,127 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ...@@ -4703,6 +4644,127 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
return; return;
} }
/**
* ext4_add_groupblocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
* @block: start physical block to add to the block group
* @count: number of blocks to free
*
* This marks the blocks as free in the bitmap and buddy.
*/
void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
unsigned int i;
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
int err = 0, ret, blk_free_count;
ext4_grpblk_t blocks_freed;
struct ext4_group_info *grp;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
grp = ext4_get_group_info(sb, block_group);
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
goto error_return;
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc(sb, block_group, &gd_bh);
if (!desc)
goto error_return;
if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
in_range(ext4_inode_bitmap(sb, desc), block, count) ||
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
goto error_return;
}
BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto error_return;
/*
* We are about to modify some metadata. Call the journal APIs
* to unshare ->b_data if a currently-committing transaction is
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
ext4_error(sb, "bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
blocks_freed++;
}
}
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
goto error_return;
/*
* need to update group_info->bb_free and bitmap
* with the group lock held. generate_buddy() looks at
* them with the group lock held
*/
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count);
mb_free_blocks(NULL, &e4b, bit, count);
blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic_add(blocks_freed,
&sbi->s_flex_groups[flex_group].free_blocks);
}
ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
if (!err)
err = ret;
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
}
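One detail worth noticing above: blocks_freed is counted per bit actually set, so blocks that were already free are reported but never double-counted into the group's free count. The same defensive accounting over a toy byte-array bitmap (plain helpers replace mb_test_bit()/mb_clear_bits(); illustrative only):

#include <stdio.h>
#include <string.h>

static unsigned char bmap[8];	/* 64-block bitmap, 1 = in use */

static int  test_bit(unsigned n)  { return bmap[n / 8] >> (n % 8) & 1; }
static void clear_bit(unsigned n) { bmap[n / 8] &= ~(1u << (n % 8)); }

int main(void)
{
	memset(bmap, 0xff, sizeof(bmap));
	clear_bit(5);				/* block 5 is already free */

	unsigned freed = 0;
	for (unsigned i = 4; i < 8; i++) {	/* "free" blocks 4..7 */
		if (!test_bit(i))
			fprintf(stderr, "bit already cleared for block %u\n", i);
		else
			freed++;
		clear_bit(i);
	}
	printf("blocks freed: %u\n", freed);	/* 3, not 4 */
	return 0;
}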
/** /**
* ext4_trim_extent -- function to TRIM one single free extent in the group * ext4_trim_extent -- function to TRIM one single free extent in the group
* @sb: super block for the file system * @sb: super block for the file system
...@@ -4715,11 +4777,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ...@@ -4715,11 +4777,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
* one will allocate those blocks, marking them as used in the buddy bitmap. This must * one will allocate those blocks, marking them as used in the buddy bitmap. This must
* be called under the group lock. * be called under the group lock.
*/ */
static int ext4_trim_extent(struct super_block *sb, int start, int count, static void ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b) ext4_group_t group, struct ext4_buddy *e4b)
{ {
struct ext4_free_extent ex; struct ext4_free_extent ex;
int ret = 0;
assert_spin_locked(ext4_group_lock_ptr(sb, group)); assert_spin_locked(ext4_group_lock_ptr(sb, group));
...@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, ...@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
*/ */
mb_mark_used(e4b, &ex); mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group); ext4_unlock_group(sb, group);
ext4_issue_discard(sb, group, start, count);
ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group); ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len); mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
} }
/** /**
...@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, ...@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
* the group buddy bitmap. This is done until the whole group is scanned. * the group buddy bitmap. This is done until the whole group is scanned.
*/ */
static ext4_grpblk_t static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) ext4_grpblk_t start, ext4_grpblk_t max,
ext4_grpblk_t minblocks)
{ {
void *bitmap; void *bitmap;
ext4_grpblk_t next, count = 0; ext4_grpblk_t next, count = 0;
ext4_group_t group; struct ext4_buddy e4b;
int ret = 0; int ret;
BUG_ON(e4b == NULL); ret = ext4_mb_load_buddy(sb, group, &e4b);
if (ret) {
ext4_error(sb, "Error in loading buddy "
"information for %u", group);
return ret;
}
bitmap = e4b.bd_bitmap;
bitmap = e4b->bd_bitmap;
group = e4b->bd_group;
start = (e4b->bd_info->bb_first_free > start) ?
e4b->bd_info->bb_first_free : start;
ext4_lock_group(sb, group); ext4_lock_group(sb, group);
start = (e4b.bd_info->bb_first_free > start) ?
e4b.bd_info->bb_first_free : start;
while (start < max) { while (start < max) {
start = mb_find_next_zero_bit(bitmap, max, start); start = mb_find_next_zero_bit(bitmap, max, start);
...@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ...@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
next = mb_find_next_bit(bitmap, max, start); next = mb_find_next_bit(bitmap, max, start);
if ((next - start) >= minblocks) { if ((next - start) >= minblocks) {
ret = ext4_trim_extent(sb, start, ext4_trim_extent(sb, start,
next - start, group, e4b); next - start, group, &e4b);
if (ret < 0)
break;
count += next - start; count += next - start;
} }
start = next + 1; start = next + 1;
...@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ...@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
ext4_lock_group(sb, group); ext4_lock_group(sb, group);
} }
if ((e4b->bd_info->bb_free - count) < minblocks) if ((e4b.bd_info->bb_free - count) < minblocks)
break; break;
} }
ext4_unlock_group(sb, group); ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n", ext4_debug("trimmed %d blocks in the group %d\n",
count, group); count, group);
if (ret < 0)
count = ret;
return count; return count;
} }
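The scan itself is a run-length walk over the free-space bitmap: find the next zero bit, find the next set bit after it, and trim the run if it is at least minlen long. A stand-alone rendition over a plain array, with simplified stand-ins for the mb_find_next_*_bit() helpers:

#include <stdio.h>

static const int used[16] = { 1,1,0,0,0,1,0,1,0,0,0,0,1,1,0,0 };

static int find_next_zero_bit(int start, int max)
{
	while (start < max && used[start])
		start++;
	return start;
}

static int find_next_bit(int start, int max)
{
	while (start < max && !used[start])
		start++;
	return start;
}

int main(void)
{
	int start = 0, max = 16, minlen = 2, count = 0;

	while (start < max) {
		start = find_next_zero_bit(start, max);
		if (start >= max)
			break;
		int next = find_next_bit(start, max);
		if (next - start >= minlen) {
			printf("trim extent [%d, %d)\n", start, next);
			count += next - start;
		}
		start = next + 1;
	}
	/* extents [2,5), [8,12) and [14,16); the lone zero at 6 is skipped */
	printf("trimmed %d blocks\n", count);	/* 9 */
	return 0;
}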
...@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ...@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
*/ */
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{ {
struct ext4_buddy e4b; struct ext4_group_info *grp;
ext4_group_t first_group, last_group; ext4_group_t first_group, last_group;
ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_group_t group, ngroups = ext4_get_groups_count(sb);
ext4_grpblk_t cnt = 0, first_block, last_block; ext4_grpblk_t cnt = 0, first_block, last_block;
uint64_t start, len, minlen, trimmed; uint64_t start, len, minlen, trimmed = 0;
ext4_fsblk_t first_data_blk = ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
int ret = 0; int ret = 0;
...@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ...@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> sb->s_blocksize_bits; start = range->start >> sb->s_blocksize_bits;
len = range->len >> sb->s_blocksize_bits; len = range->len >> sb->s_blocksize_bits;
minlen = range->minlen >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits;
trimmed = 0;
if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
return -EINVAL; return -EINVAL;
...@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ...@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
return -EINVAL; return -EINVAL;
for (group = first_group; group <= last_group; group++) { for (group = first_group; group <= last_group; group++) {
ret = ext4_mb_load_buddy(sb, group, &e4b); grp = ext4_get_group_info(sb, group);
if (ret) { /* We only do this if the grp has never been initialized */
ext4_error(sb, "Error in loading buddy " if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
"information for %u", group); ret = ext4_mb_init_group(sb, group);
break; if (ret)
break;
} }
/* /*
...@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ...@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
last_block = first_block + len; last_block = first_block + len;
len -= last_block - first_block; len -= last_block - first_block;
if (e4b.bd_info->bb_free >= minlen) { if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, &e4b, first_block, cnt = ext4_trim_all_free(sb, group, first_block,
last_block, minlen); last_block, minlen);
if (cnt < 0) { if (cnt < 0) {
ret = cnt; ret = cnt;
ext4_mb_unload_buddy(&e4b);
break; break;
} }
} }
ext4_mb_unload_buddy(&e4b);
trimmed += cnt; trimmed += cnt;
first_block = 0; first_block = 0;
} }
......
...@@ -193,11 +193,6 @@ struct ext4_allocation_context { ...@@ -193,11 +193,6 @@ struct ext4_allocation_context {
__u8 ac_op; /* operation, for history only */ __u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page; struct page *ac_bitmap_page;
struct page *ac_buddy_page; struct page *ac_buddy_page;
/*
* pointer to the held semaphore upon successful
* block allocation
*/
struct rw_semaphore *alloc_semp;
struct ext4_prealloc_space *ac_pa; struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg; struct ext4_locality_group *ac_lg;
}; };
...@@ -215,7 +210,6 @@ struct ext4_buddy { ...@@ -215,7 +210,6 @@ struct ext4_buddy {
struct super_block *bd_sb; struct super_block *bd_sb;
__u16 bd_blkbits; __u16 bd_blkbits;
ext4_group_t bd_group; ext4_group_t bd_group;
struct rw_semaphore *alloc_semp;
}; };
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
......
...@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, ...@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* We have the extent map built with the tmp inode. * We have the extent map built with the tmp inode.
* Now copy the i_data across * Now copy the i_data across
*/ */
ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/* /*
......
#include <linux/fs.h>
#include <linux/random.h>
#include <linux/buffer_head.h>
#include <linux/utsname.h>
#include <linux/kthread.h>
#include "ext4.h"
/*
* Write the MMP block using WRITE_SYNC to try to get the block on-disk
* faster.
*/
static int write_mmp_block(struct buffer_head *bh)
{
mark_buffer_dirty(bh);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh)))
return 1;
return 0;
}
/*
* Read the MMP block. It _must_ be read from disk and hence we clear the
* uptodate flag on the buffer.
*/
static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
ext4_fsblk_t mmp_block)
{
struct mmp_struct *mmp;
if (*bh)
clear_buffer_uptodate(*bh);
/* This would be sb_bread(sb, mmp_block), except we need to be sure
* that the MD RAID device cache has been bypassed, and that the read
* is not blocked in the elevator. */
if (!*bh)
*bh = sb_getblk(sb, mmp_block);
if (*bh) {
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
submit_bh(READ_SYNC, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
*bh = NULL;
}
}
if (!*bh) {
ext4_warning(sb, "Error while reading MMP block %llu",
mmp_block);
return -EIO;
}
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
return -EINVAL;
return 0;
}
/*
* Dump as much information as possible to help the admin.
*/
void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
const char *function, unsigned int line, const char *msg)
{
__ext4_warning(sb, function, line, msg);
__ext4_warning(sb, function, line,
"MMP failure info: last update time: %llu, last update "
"node: %s, last update device: %s\n",
(long long unsigned int) le64_to_cpu(mmp->mmp_time),
mmp->mmp_nodename, mmp->mmp_bdevname);
}
/*
* kmmpd will update the MMP sequence every s_mmp_update_interval seconds
*/
static int kmmpd(void *data)
{
struct super_block *sb = ((struct mmpd_data *) data)->sb;
struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
unsigned long failed_writes = 0;
int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
int retval;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
mmp->mmp_time = cpu_to_le64(get_seconds());
/*
* Start with the higher mmp_check_interval and reduce it if
* the MMP block is being updated on time.
*/
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->sysname,
sizeof(mmp->mmp_nodename));
while (!kthread_should_stop()) {
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
mmp->mmp_seq = cpu_to_le32(seq);
mmp->mmp_time = cpu_to_le64(get_seconds());
last_update_time = jiffies;
retval = write_mmp_block(bh);
/*
* Don't spew too many error messages. Print one every
* (s_mmp_update_interval * 60) seconds.
*/
if (retval && (failed_writes % 60) == 0) {
ext4_error(sb, "Error writing to MMP block");
failed_writes++;
}
if (!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_MMP)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
if (sb->s_flags & MS_RDONLY) {
ext4_warning(sb, "kmmpd being stopped since filesystem "
"has been remounted as readonly.");
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
HZ - diff);
/*
* We need to make sure that more than mmp_check_interval
* seconds have not passed since writing. If that has happened
* we need to check if the MMP block is as we left it.
*/
diff = jiffies - last_update_time;
if (diff > mmp_check_interval * HZ) {
struct buffer_head *bh_check = NULL;
struct mmp_struct *mmp_check;
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
ext4_error(sb, "error reading MMP data: %d",
retval);
EXT4_SB(sb)->s_mmp_tsk = NULL;
goto failed;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
if (mmp->mmp_seq != mmp_check->mmp_seq ||
memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
sizeof(mmp->mmp_nodename))) {
dump_mmp_msg(sb, mmp_check,
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
ext4_error(sb, "abort");
goto failed;
}
put_bh(bh_check);
}
/*
* Adjust the mmp_check_interval depending on how much time
* it took for the MMP block to be written.
*/
mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
EXT4_MMP_MAX_CHECK_INTERVAL),
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
}
/*
* Unmount seems to be clean.
*/
mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
mmp->mmp_time = cpu_to_le64(get_seconds());
retval = write_mmp_block(bh);
failed:
kfree(data);
brelse(bh);
return retval;
}
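The adaptive step at the bottom of the loop resizes the check interval from the observed write latency, clamped between fixed bounds. In isolation (HZ and the MMP_* constants below are stand-in values, not the kernel's):

#include <stdio.h>

#define HZ			100	/* stand-in tick rate        */
#define MMP_CHECK_MULT		2	/* illustrative stand-ins    */
#define MMP_MIN_CHECK_INTERVAL	5	/* for the EXT4_MMP_* macros */
#define MMP_MAX_CHECK_INTERVAL	300

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

/* diff = jiffies spent writing the MMP block */
static unsigned long adjust_interval(unsigned long diff)
{
	return max(min(MMP_CHECK_MULT * diff / HZ,
		       (unsigned long)MMP_MAX_CHECK_INTERVAL),
		   (unsigned long)MMP_MIN_CHECK_INTERVAL);
}

int main(void)
{
	printf("%lu\n", adjust_interval(50));	  /* fast write  -> floor of 5 */
	printf("%lu\n", adjust_interval(2000));	  /* slow write  -> 40 seconds */
	printf("%lu\n", adjust_interval(900000)); /* pathological -> capped 300 */
	return 0;
}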
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
*/
static unsigned int mmp_new_seq(void)
{
u32 new_seq;
do {
get_random_bytes(&new_seq, sizeof(u32));
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
}
/*
* Protect the filesystem from being mounted more than once.
*/
int ext4_multi_mount_protect(struct super_block *sb,
ext4_fsblk_t mmp_block)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
int retval;
if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
mmp_block >= ext4_blocks_count(es)) {
ext4_warning(sb, "Invalid MMP block in superblock");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
/*
* If check_interval in MMP block is larger, use that instead of
* update_interval from the superblock.
*/
if (mmp->mmp_check_interval > mmp_check_interval)
mmp_check_interval = mmp->mmp_check_interval;
seq = le32_to_cpu(mmp->mmp_seq);
if (seq == EXT4_MMP_SEQ_CLEAN)
goto skip;
if (seq == EXT4_MMP_SEQ_FSCK) {
dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
goto failed;
}
wait_time = min(mmp_check_interval * 2 + 1,
mmp_check_interval + 60);
/* Print MMP interval if more than 20 secs. */
if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
ext4_warning(sb, "MMP interval %u higher than expected, please"
" wait.\n", wait_time * 2);
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
goto failed;
}
skip:
/*
* write a new random sequence number.
*/
mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
retval = write_mmp_block(bh);
if (retval)
goto failed;
/*
* wait for MMP interval and check mmp_seq.
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
goto failed;
}
retval = read_mmp_block(sb, &bh, mmp_block);
if (retval)
goto failed;
mmp = (struct mmp_struct *)(bh->b_data);
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
goto failed;
}
mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
if (!mmpd_data) {
ext4_warning(sb, "not enough memory for mmpd_data");
goto failed;
}
mmpd_data->sb = sb;
mmpd_data->bh = bh;
/*
* Start a kernel thread to update the MMP block periodically.
*/
EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
goto failed;
}
return 0;
failed:
brelse(bh);
return 1;
}
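Condensed, the mount-time handshake is: read the sequence; bail out on the FSCK marker; skip the first wait if the last unmount was clean; otherwise watch for activity; then write a fresh random sequence, wait one interval, and verify nobody overwrote it. A single-threaded userspace simulation of that protocol (the "disk" is a struct in memory, the sleeps are elided, and the sentinel values are stand-ins):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MMP_SEQ_CLEAN	0xFF4D4D50u	/* stand-ins for the  */
#define MMP_SEQ_FSCK	0xE24D4D50u	/* kernel's sentinels */
#define MMP_SEQ_MAX	0xE24D4D4Fu

struct mmp_block { uint32_t seq; };	/* in-memory "disk" block */

static struct mmp_block disk = { .seq = MMP_SEQ_CLEAN };

static uint32_t new_seq(void)
{
	uint32_t s;
	do
		s = (uint32_t)rand();
	while (s > MMP_SEQ_MAX);	/* rejection-sample into range */
	return s;
}

/* returns 0 if this "node" may mount, 1 if another node owns the fs */
static int multi_mount_protect(void)
{
	uint32_t seq = disk.seq;

	if (seq == MMP_SEQ_FSCK)
		return 1;			/* fsck is running */
	if (seq != MMP_SEQ_CLEAN) {
		/* unclean: a real mount sleeps one interval here, then
		 * re-reads to see whether another node is still alive */
		if (disk.seq != seq)
			return 1;
	}
	disk.seq = seq = new_seq();		/* stake our claim */
	/* ... sleep for the check interval here ... */
	if (disk.seq != seq)
		return 1;			/* somebody else wrote it */
	return 0;				/* safe; now start kmmpd */
}

int main(void)
{
	printf("mount allowed: %s\n", multi_mount_protect() ? "no" : "yes");
	disk.seq = MMP_SEQ_FSCK;
	printf("mount allowed: %s\n", multi_mount_protect() ? "no" : "yes");
	return 0;
}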
...@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, ...@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* It needs to call wait_on_page_writeback() to wait for the * It needs to call wait_on_page_writeback() to wait for the
* writeback of the page. * writeback of the page.
*/ */
if (PageWriteback(page)) wait_on_page_writeback(page);
wait_on_page_writeback(page);
/* Release old bh and drop refs */ /* Release old bh and drop refs */
try_to_release_page(page, 0); try_to_release_page(page, 0);
......
...@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ...@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries; frame->at = entries;
frame->bh = bh; frame->bh = bh;
bh = bh2; bh = bh2;
ext4_handle_dirty_metadata(handle, dir, frame->bh);
ext4_handle_dirty_metadata(handle, dir, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval); de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
dx_release (frames); if (!de) {
if (!(de)) /*
* Even if the block split failed, we have to properly write
* out all the changes we did so far. Otherwise we can end up
* with a corrupted filesystem.
*/
ext4_mark_inode_dirty(handle, dir);
dx_release(frames);
return retval; return retval;
}
dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh); retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh); brelse(bh);
...@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir, ...@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
handle_t *handle; handle_t *handle;
struct inode *inode; struct inode *inode;
int l, err, retries = 0; int l, err, retries = 0;
int credits;
l = strlen(symname)+1; l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize) if (l > dir->i_sb->s_blocksize)
...@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir, ...@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
dquot_initialize(dir); dquot_initialize(dir);
if (l > EXT4_N_BLOCKS * 4) {
/*
* For non-fast symlinks, we just allocate inode and put it on
* orphan list in the first transaction => we need bitmap,
* group descriptor, sb, inode block, quota blocks.
*/
credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
} else {
/*
* Fast symlink. We have to add entry to directory
* (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
* allocate new inode (bitmap, group descriptor, inode block,
* quota blocks, sb is already counted in previous macros).
*/
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
}
retry: retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + handle = ext4_journal_start(dir, credits);
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) if (IS_ERR(handle))
return PTR_ERR(handle); return PTR_ERR(handle);
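The credit calculation reads naturally as a small decision function: long symlinks start a cheap transaction that only creates the inode and parks it on the orphan list, while fast symlinks pay for the directory update and inode allocation up front. The same shape, with illustrative placeholder numbers standing in for the EXT4_* credit macros:

#include <stdio.h>

/* illustrative placeholders, not the kernel's macro values */
#define DATA_TRANS_BLOCKS	 14
#define INDEX_EXTRA_TRANS_BLOCKS 6
#define MAXQUOTAS_INIT_BLOCKS	 4
#define N_BLOCKS		 15	/* i_data slots */

static int symlink_credits(int target_len)
{
	if (target_len > N_BLOCKS * 4)
		/* slow symlink: inode + orphan list only (bitmap, group
		 * descriptor, sb, inode block) plus quota init */
		return 4 + MAXQUOTAS_INIT_BLOCKS;
	/* fast symlink: dir entry + new inode in one transaction */
	return DATA_TRANS_BLOCKS + INDEX_EXTRA_TRANS_BLOCKS + 3 +
	       MAXQUOTAS_INIT_BLOCKS;
}

int main(void)
{
	printf("fast symlink credits: %d\n", symlink_credits(20));
	printf("slow symlink credits: %d\n", symlink_credits(200));
	return 0;
}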
...@@ -2263,21 +2292,44 @@ static int ext4_symlink(struct inode *dir, ...@@ -2263,21 +2292,44 @@ static int ext4_symlink(struct inode *dir,
if (IS_ERR(inode)) if (IS_ERR(inode))
goto out_stop; goto out_stop;
if (l > sizeof(EXT4_I(inode)->i_data)) { if (l > EXT4_N_BLOCKS * 4) {
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode); ext4_set_aops(inode);
/* /*
* page_symlink() calls into ext4_prepare/commit_write. * We cannot call page_symlink() with transaction started
* We have a transaction open. All is sweetness. It also sets * because it calls into ext4_write_begin() which can wait
* i_size in generic_commit_write(). * for transaction commit if we are running out of space
* and thus we deadlock. So we have to stop transaction now
* and restart it when the symlink contents are written.
*
* To keep the fs consistent in case of a crash, we have to put the
* inode on the orphan list in the meantime.
*/ */
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
if (err)
goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1); err = __page_symlink(inode, symname, l, 1);
if (err)
goto err_drop_inode;
/*
* Now the inode is being linked into the dir (EXT4_DATA_TRANS_BLOCKS
* + EXT4_INDEX_EXTRA_TRANS_BLOCKS); the inode itself is also modified
*/
handle = ext4_journal_start(dir,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto err_drop_inode;
}
inc_nlink(inode);
err = ext4_orphan_del(handle, inode);
if (err) { if (err) {
ext4_journal_stop(handle);
clear_nlink(inode); clear_nlink(inode);
unlock_new_inode(inode); goto err_drop_inode;
ext4_mark_inode_dirty(handle, inode);
iput(inode);
goto out_stop;
} }
} else { } else {
/* clear the extent format for fast symlink */ /* clear the extent format for fast symlink */
...@@ -2293,6 +2345,10 @@ static int ext4_symlink(struct inode *dir, ...@@ -2293,6 +2345,10 @@ static int ext4_symlink(struct inode *dir,
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry; goto retry;
return err; return err;
err_drop_inode:
unlock_new_inode(inode);
iput(inode);
return err;
} }
static int ext4_link(struct dentry *old_dentry, static int ext4_link(struct dentry *old_dentry,
......
...@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error) ...@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
for (i = 0; i < io_end->num_io_pages; i++) { for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page; struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head; struct buffer_head *bh, *head;
int partial_write = 0; loff_t offset;
loff_t io_end_offset;
head = page_buffers(page); if (error) {
if (error)
SetPageError(page); SetPageError(page);
BUG_ON(!head); set_bit(AS_EIO, &page->mapping->flags);
if (head->b_size != PAGE_CACHE_SIZE) { head = page_buffers(page);
loff_t offset; BUG_ON(!head);
loff_t io_end_offset = io_end->offset + io_end->size;
io_end_offset = io_end->offset + io_end->size;
offset = (sector_t) page->index << PAGE_CACHE_SHIFT; offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
bh = head; bh = head;
do { do {
if ((offset >= io_end->offset) && if ((offset >= io_end->offset) &&
(offset+bh->b_size <= io_end_offset)) { (offset+bh->b_size <= io_end_offset))
if (error) buffer_io_error(bh);
buffer_io_error(bh);
}
if (buffer_delay(bh))
partial_write = 1;
else if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
else if (buffer_dirty(bh))
partial_write = 1;
offset += bh->b_size; offset += bh->b_size;
bh = bh->b_this_page; bh = bh->b_this_page;
} while (bh != head); } while (bh != head);
} }
/*
* If this is a partial write which happened to make
* all buffers uptodate then we can optimize away a
* bogus readpage() for the next read(). Here we
* 'discover' whether the page went uptodate as a
* result of this (potentially partial) write.
*/
if (!partial_write)
SetPageUptodate(page);
put_io_page(io_end->pages[i]); put_io_page(io_end->pages[i]);
} }
io_end->num_io_pages = 0; io_end->num_io_pages = 0;
......
...@@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb); ...@@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data); const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly); static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void); static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb); static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void); static void ext4_clear_request_list(void);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
.mount = ext4_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = { static struct file_system_type ext3_fs_type = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
...@@ -806,6 +822,8 @@ static void ext4_put_super(struct super_block *sb) ...@@ -806,6 +822,8 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev); invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi); ext4_blkdev_remove(sbi);
} }
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL; sb->s_fs_info = NULL;
/* /*
* Now that we are completely done shutting down the * Now that we are completely done shutting down the
...@@ -1096,7 +1114,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) ...@@ -1096,7 +1114,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (!test_opt(sb, INIT_INODE_TABLE)) if (!test_opt(sb, INIT_INODE_TABLE))
seq_puts(seq, ",noinit_inode_table"); seq_puts(seq, ",noinit_inode_table");
else if (sbi->s_li_wait_mult) else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
seq_printf(seq, ",init_inode_table=%u", seq_printf(seq, ",init_inode_table=%u",
(unsigned) sbi->s_li_wait_mult); (unsigned) sbi->s_li_wait_mult);
...@@ -1187,9 +1205,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ...@@ -1187,9 +1205,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off); const char *data, size_t len, loff_t off);
static const struct dquot_operations ext4_quota_operations = { static const struct dquot_operations ext4_quota_operations = {
#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space, .get_reserved_space = ext4_get_reserved_space,
#endif
.write_dquot = ext4_write_dquot, .write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot, .acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot, .release_dquot = ext4_release_dquot,
...@@ -1900,7 +1916,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ...@@ -1900,7 +1916,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_WARNING, ext4_msg(sb, KERN_WARNING,
"warning: mounting fs with errors, " "warning: mounting fs with errors, "
"running e2fsck is recommended"); "running e2fsck is recommended");
else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
le16_to_cpu(es->s_mnt_count) >= le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
ext4_msg(sb, KERN_WARNING, ext4_msg(sb, KERN_WARNING,
...@@ -2425,6 +2441,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, ...@@ -2425,6 +2441,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
EXT4_SB(sb)->s_sectors_written_start) >> 1))); EXT4_SB(sb)->s_sectors_written_start) >> 1)));
} }
static ssize_t extent_cache_hits_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
}
static ssize_t extent_cache_misses_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
}
static ssize_t inode_readahead_blks_store(struct ext4_attr *a, static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
struct ext4_sb_info *sbi, struct ext4_sb_info *sbi,
const char *buf, size_t count) const char *buf, size_t count)
...@@ -2482,6 +2510,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) ...@@ -2482,6 +2510,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_RO_ATTR(extent_cache_hits);
EXT4_RO_ATTR(extent_cache_misses);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks); inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
...@@ -2497,6 +2527,8 @@ static struct attribute *ext4_attrs[] = { ...@@ -2497,6 +2527,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes), ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(extent_cache_hits),
ATTR_LIST(extent_cache_misses),
ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal), ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats), ATTR_LIST(mb_stats),
...@@ -2659,12 +2691,6 @@ static void print_daily_error_info(unsigned long arg) ...@@ -2659,12 +2691,6 @@ static void print_daily_error_info(unsigned long arg)
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
} }
static void ext4_lazyinode_timeout(unsigned long data)
{
struct task_struct *p = (struct task_struct *)data;
wake_up_process(p);
}
/* Find next suitable group and run ext4_init_inode_table */ /* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr) static int ext4_run_li_request(struct ext4_li_request *elr)
{ {
...@@ -2696,11 +2722,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ...@@ -2696,11 +2722,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = ext4_init_inode_table(sb, group, ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1); elr->lr_timeout ? 0 : 1);
if (elr->lr_timeout == 0) { if (elr->lr_timeout == 0) {
timeout = jiffies - timeout; timeout = (jiffies - timeout) *
if (elr->lr_sbi->s_li_wait_mult) elr->lr_sbi->s_li_wait_mult;
timeout *= elr->lr_sbi->s_li_wait_mult;
else
timeout *= 20;
elr->lr_timeout = timeout; elr->lr_timeout = timeout;
} }
elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_sched = jiffies + elr->lr_timeout;
...@@ -2712,7 +2735,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ...@@ -2712,7 +2735,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
/* /*
* Remove lr_request from the list_request and free the * Remove lr_request from the list_request and free the
* request tructure. Should be called with li_list_mtx held * request structure. Should be called with li_list_mtx held
*/ */
static void ext4_remove_li_request(struct ext4_li_request *elr) static void ext4_remove_li_request(struct ext4_li_request *elr)
{ {
...@@ -2730,14 +2753,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr) ...@@ -2730,14 +2753,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
static void ext4_unregister_li_request(struct super_block *sb) static void ext4_unregister_li_request(struct super_block *sb)
{ {
struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; mutex_lock(&ext4_li_mtx);
if (!ext4_li_info) {
if (!ext4_li_info) mutex_unlock(&ext4_li_mtx);
return; return;
}
mutex_lock(&ext4_li_info->li_list_mtx); mutex_lock(&ext4_li_info->li_list_mtx);
ext4_remove_li_request(elr); ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_info->li_list_mtx);
mutex_unlock(&ext4_li_mtx);
} }
static struct task_struct *ext4_lazyinit_task; static struct task_struct *ext4_lazyinit_task;
...@@ -2756,17 +2781,10 @@ static int ext4_lazyinit_thread(void *arg) ...@@ -2756,17 +2781,10 @@ static int ext4_lazyinit_thread(void *arg)
struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
struct list_head *pos, *n; struct list_head *pos, *n;
struct ext4_li_request *elr; struct ext4_li_request *elr;
unsigned long next_wakeup; unsigned long next_wakeup, cur;
DEFINE_WAIT(wait);
BUG_ON(NULL == eli); BUG_ON(NULL == eli);
eli->li_timer.data = (unsigned long)current;
eli->li_timer.function = ext4_lazyinode_timeout;
eli->li_task = current;
wake_up(&eli->li_wait_task);
cont_thread: cont_thread:
while (true) { while (true) {
next_wakeup = MAX_JIFFY_OFFSET; next_wakeup = MAX_JIFFY_OFFSET;
...@@ -2797,19 +2815,15 @@ static int ext4_lazyinit_thread(void *arg) ...@@ -2797,19 +2815,15 @@ static int ext4_lazyinit_thread(void *arg)
if (freezing(current)) if (freezing(current))
refrigerator(); refrigerator();
if ((time_after_eq(jiffies, next_wakeup)) || cur = jiffies;
if ((time_after_eq(cur, next_wakeup)) ||
(MAX_JIFFY_OFFSET == next_wakeup)) { (MAX_JIFFY_OFFSET == next_wakeup)) {
cond_resched(); cond_resched();
continue; continue;
} }
eli->li_timer.expires = next_wakeup; schedule_timeout_interruptible(next_wakeup - cur);
add_timer(&eli->li_timer);
prepare_to_wait(&eli->li_wait_daemon, &wait,
TASK_INTERRUPTIBLE);
if (time_before(jiffies, next_wakeup))
schedule();
finish_wait(&eli->li_wait_daemon, &wait);
if (kthread_should_stop()) { if (kthread_should_stop()) {
ext4_clear_request_list(); ext4_clear_request_list();
goto exit_thread; goto exit_thread;
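Note that jiffies is sampled once into cur, so the wakeup test and the timeout subtraction see the same snapshot; re-reading the clock between the two could yield an enormous unsigned sleep if jiffies crossed next_wakeup in the gap. The wrap-safe comparison itself is the usual signed-difference trick, re-derived here in userspace (not the kernel's time_after_eq() macro itself):

#include <limits.h>
#include <stdio.h>

/* wrap-safe "a >= b" for a free-running tick counter */
static int tick_after_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

int main(void)
{
	unsigned long next_wakeup = 100;
	unsigned long cur = 40;		/* single snapshot of the clock */

	if (!tick_after_eq(cur, next_wakeup))
		printf("sleep for %lu ticks\n", next_wakeup - cur);	/* 60 */

	/* works across wraparound too */
	cur = ULONG_MAX - 5;		/* clock about to wrap     */
	next_wakeup = 10;		/* deadline after the wrap */
	printf("deadline passed: %d\n", tick_after_eq(cur, next_wakeup)); /* 0 */
	printf("sleep for %lu ticks\n", next_wakeup - cur);		/* 16 */
	return 0;
}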
...@@ -2833,12 +2847,7 @@ static int ext4_lazyinit_thread(void *arg) ...@@ -2833,12 +2847,7 @@ static int ext4_lazyinit_thread(void *arg)
goto cont_thread; goto cont_thread;
} }
mutex_unlock(&eli->li_list_mtx); mutex_unlock(&eli->li_list_mtx);
del_timer_sync(&ext4_li_info->li_timer);
eli->li_task = NULL;
wake_up(&eli->li_wait_task);
kfree(ext4_li_info); kfree(ext4_li_info);
ext4_lazyinit_task = NULL;
ext4_li_info = NULL; ext4_li_info = NULL;
mutex_unlock(&ext4_li_mtx); mutex_unlock(&ext4_li_mtx);
...@@ -2866,7 +2875,6 @@ static int ext4_run_lazyinit_thread(void) ...@@ -2866,7 +2875,6 @@ static int ext4_run_lazyinit_thread(void)
if (IS_ERR(ext4_lazyinit_task)) { if (IS_ERR(ext4_lazyinit_task)) {
int err = PTR_ERR(ext4_lazyinit_task); int err = PTR_ERR(ext4_lazyinit_task);
ext4_clear_request_list(); ext4_clear_request_list();
del_timer_sync(&ext4_li_info->li_timer);
kfree(ext4_li_info); kfree(ext4_li_info);
ext4_li_info = NULL; ext4_li_info = NULL;
printk(KERN_CRIT "EXT4: error %d creating inode table " printk(KERN_CRIT "EXT4: error %d creating inode table "
...@@ -2875,8 +2883,6 @@ static int ext4_run_lazyinit_thread(void) ...@@ -2875,8 +2883,6 @@ static int ext4_run_lazyinit_thread(void)
return err; return err;
} }
ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
return 0; return 0;
} }
...@@ -2911,13 +2917,9 @@ static int ext4_li_info_new(void) ...@@ -2911,13 +2917,9 @@ static int ext4_li_info_new(void)
if (!eli) if (!eli)
return -ENOMEM; return -ENOMEM;
eli->li_task = NULL;
INIT_LIST_HEAD(&eli->li_request_list); INIT_LIST_HEAD(&eli->li_request_list);
mutex_init(&eli->li_list_mtx); mutex_init(&eli->li_list_mtx);
init_waitqueue_head(&eli->li_wait_daemon);
init_waitqueue_head(&eli->li_wait_task);
init_timer(&eli->li_timer);
eli->li_state |= EXT4_LAZYINIT_QUIT; eli->li_state |= EXT4_LAZYINIT_QUIT;
ext4_li_info = eli; ext4_li_info = eli;
...@@ -2960,20 +2962,19 @@ static int ext4_register_li_request(struct super_block *sb, ...@@ -2960,20 +2962,19 @@ static int ext4_register_li_request(struct super_block *sb,
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int ret = 0; int ret = 0;
if (sbi->s_li_request != NULL) if (sbi->s_li_request != NULL) {
/*
* Reset timeout so it can be computed again, because
* s_li_wait_mult might have changed.
*/
sbi->s_li_request->lr_timeout = 0;
return 0; return 0;
}
if (first_not_zeroed == ngroups || if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) || (sb->s_flags & MS_RDONLY) ||
!test_opt(sb, INIT_INODE_TABLE)) { !test_opt(sb, INIT_INODE_TABLE))
sbi->s_li_request = NULL;
return 0; return 0;
}
if (first_not_zeroed == ngroups) {
sbi->s_li_request = NULL;
return 0;
}
elr = ext4_li_request_new(sb, first_not_zeroed); elr = ext4_li_request_new(sb, first_not_zeroed);
if (!elr) if (!elr)
...@@ -3166,6 +3167,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3166,6 +3167,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC); set_opt(sb, DELALLOC);
/*
* set the default s_li_wait_mult for lazyinit, in case there is
* no mount option specified.
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
&journal_devnum, &journal_ioprio, NULL, 0)) { &journal_devnum, &journal_ioprio, NULL, 0)) {
ext4_msg(sb, KERN_WARNING, ext4_msg(sb, KERN_WARNING,
...@@ -3187,6 +3194,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3187,6 +3194,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"feature flags set on rev 0 fs, " "feature flags set on rev 0 fs, "
"running e2fsck is recommended"); "running e2fsck is recommended");
if (IS_EXT2_SB(sb)) {
if (ext2_feature_set_ok(sb))
ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
"using the ext4 subsystem");
else {
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
goto failed_mount;
}
}
if (IS_EXT3_SB(sb)) {
if (ext3_feature_set_ok(sb))
ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
"using the ext4 subsystem");
else {
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
goto failed_mount;
}
}
/* /*
* Check feature flags regardless of the revision level, since we * Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags, * previously didn't change the revision level when setting the flags,
...@@ -3459,6 +3488,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3459,6 +3488,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_RECOVER)); EXT4_FEATURE_INCOMPAT_RECOVER));
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
!(sb->s_flags & MS_RDONLY))
if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
goto failed_mount3;
/* /*
* The first inode we look at is the journal inode. Don't try * The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal! * root first: it may be modified in the journal!
...@@ -3474,7 +3508,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3474,7 +3508,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq; goto failed_mount_wq;
} else { } else {
clear_opt(sb, DATA_FLAGS); clear_opt(sb, DATA_FLAGS);
set_opt(sb, WRITEBACK_DATA);
sbi->s_journal = NULL; sbi->s_journal = NULL;
needs_recovery = 0; needs_recovery = 0;
goto no_journal; goto no_journal;
...@@ -3707,6 +3740,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3707,6 +3740,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyblocks_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2: failed_mount2:
for (i = 0; i < db_count; i++) for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]); brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4277,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	int err;
+	int err = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
@@ -4368,6 +4403,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+						      EXT4_FEATURE_INCOMPAT_MMP))
+				if (ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block))) {
+					err = -EROFS;
+					goto restore_opts;
+				}
 			enable_quota = 1;
 		}
 	}
@@ -4432,6 +4474,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	u64 fsid;
+	s64 bfree;
 	if (test_opt(sb, MINIX_DF)) {
 		sbi->s_overhead_last = 0;
@@ -4475,8 +4518,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
-		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+	bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+		percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+	/* prevent underflow in case that few free space is available */
+	buf->f_bfree = max_t(s64, bfree, 0);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
 		buf->f_bavail = 0;
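The clamp matters because the two percpu counter sums are sampled at different moments: delayed allocation can briefly push the dirty-blocks sum past the free-blocks sum, so the naive subtraction can go negative and, once stored into the unsigned f_bfree, show up as an enormous bogus free count. A minimal standalone illustration of the same clamp (not ext4 code):

#include <stdint.h>
#include <stdio.h>

/* Two approximate counters sampled at slightly different times can
 * cross; clamp the difference before reporting it as "free". */
static uint64_t report_free(int64_t free_sum, int64_t dirty_sum)
{
	int64_t bfree = free_sum - dirty_sum;

	return bfree < 0 ? 0 : (uint64_t)bfree;
}

int main(void)
{
	/* dirty momentarily exceeds free: without the clamp this would
	 * wrap to roughly 1.8e19 when stored into an unsigned field. */
	printf("%llu\n", (unsigned long long)report_free(100, 130));
	return 0;
}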
@@ -4652,6 +4697,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
 	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);

+	if (!inode)
+		goto out;
+
 	/* Update modification times of quota files when userspace can
 	 * start looking at them */
 	handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4820,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 }

 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext2",
-	.mount		= ext4_mount,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
 static inline void register_as_ext2(void)
 {
 	int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4832,22 @@ static inline void unregister_as_ext2(void)
 {
 	unregister_filesystem(&ext2_fs_type);
 }
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+		return 0;
+	if (sb->s_flags & MS_RDONLY)
+		return 1;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+		return 0;
+	return 1;
+}
 MODULE_ALIAS("ext2");
 #else
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
@@ -4811,10 +4863,24 @@ static inline void unregister_as_ext3(void)
 {
 	unregister_filesystem(&ext3_fs_type);
 }
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+		return 0;
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+		return 0;
+	if (sb->s_flags & MS_RDONLY)
+		return 1;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+		return 0;
+	return 1;
+}
 MODULE_ALIAS("ext3");
 #else
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
 #endif

 static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4964,8 @@ static int __init ext4_init_fs(void)
 	err = init_inodecache();
 	if (err)
 		goto out1;
-	register_as_ext2();
 	register_as_ext3();
+	register_as_ext2();
 	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
...
@@ -820,8 +820,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 			goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;

-		block = ext4_new_meta_blocks(handle, inode,
-					     goal, NULL, &error);
+		block = ext4_new_meta_blocks(handle, inode, goal, 0,
+					     NULL, &error);
 		if (error)
 			goto cleanup;
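The extra 0 here is a new flags argument: the series widens ext4_new_meta_blocks() so callers can pass allocation flags alongside the optional block count. The post-series prototype, as I read the ext4 headers (treat this as an assumption and verify against fs/ext4/ext4.h):

/* Assumed prototype after this series; the flags parameter is the
 * addition, and count may be NULL when exactly one block is wanted. */
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle,
					 struct inode *inode,
					 ext4_fsblk_t goal,
					 unsigned int flags,
					 unsigned long *count,
					 int *errp);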
...
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
 			ret = err;
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
-		commit_transaction->t_flushed_data_blocks = 1;
 		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		smp_mb__after_clear_bit();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		err = 0;
 	}

+	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	commit_transaction->t_state = T_COMMIT_DFLUSH;
+	write_unlock(&journal->j_state_lock);
 	/*
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
 	 * the commit record
 	 */
-	if (commit_transaction->t_flushed_data_blocks &&
+	if (commit_transaction->t_need_data_flush &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
 		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		   required. */
 		JBUFFER_TRACE(jh, "file as BJ_Forget");
 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/* Wake up any transactions which were waiting for this
-		   IO to complete */
+		/*
+		 * Wake up any transactions which were waiting for this IO to
+		 * complete. The barrier must be here so that changes by
+		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
+		 * does the waitqueue check.
+		 */
+		smp_mb();
 		wake_up_bit(&bh->b_state, BH_Unshadow);
 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 		__brelse(bh);
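The new comment describes the classic waker/waiter pairing: the waker must make its state change visible before wake_up_bit() checks whether the waitqueue is empty, and the waiter must enqueue itself before re-checking the state, or the wakeup can be lost. A generic kernel-style sketch of both sides (obj, MY_BIT and the two predicates are placeholders, not jbd2 names):

	/* Waker side (sketch): publish the change, barrier, then wake. */
	make_state_change(obj);		/* here: jbd2_journal_file_buffer() */
	smp_mb();			/* store visible before waitqueue check */
	wake_up_bit(&obj->b_state, MY_BIT);

	/* Waiter side (sketch): enqueue first, re-check, then sleep. */
	DEFINE_WAIT_BIT(wait, &obj->b_state, MY_BIT);
	wait_queue_head_t *wqh = bit_waitqueue(&obj->b_state, MY_BIT);

	for (;;) {
		prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
		if (!state_still_old(obj))	/* re-check after enqueueing */
			break;
		schedule();
	}
	finish_wait(wqh, &wait.wait);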
@@ -794,6 +802,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		jbd2_journal_abort(journal, err);

 	jbd_debug(3, "JBD: commit phase 5\n");
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+	commit_transaction->t_state = T_COMMIT_JFLUSH;
+	write_unlock(&journal->j_state_lock);

 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd_debug(3, "JBD: commit phase 7\n");

-	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 	commit_transaction->t_start = jiffies;
 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
...
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
 	/*
-	 * Are we already doing a recent enough commit?
+	 * The only transaction we can possibly wait upon is the
+	 * currently running transaction (if it exists).  Otherwise,
+	 * the target tid must be an old one.
 	 */
-	if (!tid_geq(journal->j_commit_request, target)) {
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == target) {
 		/*
 		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 			  journal->j_commit_sequence);
 		wake_up(&journal->j_wait_commit);
 		return 1;
-	}
+	} else if (!tid_geq(journal->j_commit_request, target))
+		/* This should never happen, but if it does, preserve
+		   the evidence before kjournald goes into a loop and
+		   increments j_commit_sequence beyond all recognition. */
+		WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+			  journal->j_commit_request,
+			  journal->j_commit_sequence,
+			  target, journal->j_running_transaction ?
+			  journal->j_running_transaction->t_tid : 0);
 	return 0;
 }
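For reference, tid_geq() is the wrap-safe TID comparison from include/linux/jbd2.h: it compares by the sign of the 32-bit difference, so it stays correct when the tid counter wraps. Reproduced here from memory, so verify against the header:

static inline int tid_geq(tid_t x, tid_t y)
{
	int difference = (x - y);

	return (difference >= 0);
}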
@@ -576,6 +587,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 	return ret;
 }

+/*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+	int ret = 0;
+	transaction_t *commit_trans;
+
+	if (!(journal->j_flags & JBD2_BARRIER))
+		return 0;
+	read_lock(&journal->j_state_lock);
+	/* Transaction already committed? */
+	if (tid_geq(journal->j_commit_sequence, tid))
+		goto out;
+	commit_trans = journal->j_committing_transaction;
+	if (!commit_trans || commit_trans->t_tid != tid) {
+		ret = 1;
+		goto out;
+	}
+	/*
+	 * Transaction is being committed and we already proceeded to
+	 * submitting a flush to fs partition?
+	 */
+	if (journal->j_fs_dev != journal->j_dev) {
+		if (!commit_trans->t_need_data_flush ||
+		    commit_trans->t_state >= T_COMMIT_DFLUSH)
+			goto out;
+	} else {
+		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+			goto out;
+	}
+	ret = 1;
+out:
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
 /*
  * Wait for a specified commit to complete.
  * The caller may not hold the journal lock.
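The intended consumer is fsync: if the committing transaction will itself flush the data device, the caller can skip issuing its own flush. A condensed sketch of how ext4_sync_file() uses this after the series (simplified from the "fix waiting and sending of a barrier in ext4_sync_file()" patch; error handling and the fastpath checks are omitted, so treat details as approximate):

	bool needs_barrier = false;

	/* If the commit will not flush the data device on our behalf,
	 * we must send the flush ourselves after the commit finishes. */
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;

	jbd2_log_start_commit(journal, commit_tid);
	ret = jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);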
...
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  */

 /*
- * Update transiaction's maximum wait time, if debugging is enabled.
+ * Update transaction's maximum wait time, if debugging is enabled.
  *
  * In order for t_max_wait to be reliable, it must be protected by a
  * lock.  But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  * means that maximum wait time reported by the jbd2_run_stats
  * tracepoint will always be zero.
  */
-static inline void update_t_max_wait(transaction_t *transaction)
+static inline void update_t_max_wait(transaction_t *transaction,
+				     unsigned long ts)
 {
 #ifdef CONFIG_JBD2_DEBUG
-	unsigned long ts = jiffies;
 	if (jbd2_journal_enable_debug &&
 	    time_after(transaction->t_start, ts)) {
 		ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 	tid_t tid;
 	int needed, need_to_start;
 	int nblocks = handle->h_buffer_credits;
+	unsigned long ts = jiffies;

 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 	/* OK, account for the buffers that this operation expects to
 	 * use and add the handle to the running transaction.
 	 */
-	update_t_max_wait(transaction);
+	update_t_max_wait(transaction, ts);
 	handle->h_transaction = transaction;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
  *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
  */
 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
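Since failure is reported via ERR_PTR() rather than NULL, callers must test the result with IS_ERR(); a minimal caller sketch:

	handle_t *handle;

	handle = jbd2__journal_start(journal, nblocks, GFP_NOFS);
	if (IS_ERR(handle))		/* a NULL check would miss the error */
		return PTR_ERR(handle);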
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	 */
 	JBUFFER_TRACE(jh, "cancelling revoke");
 	jbd2_journal_cancel_revoke(handle, jh);
-	jbd2_journal_put_journal_head(jh);
 out:
+	jbd2_journal_put_journal_head(jh);
 	return err;
 }
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 	    jinode->i_next_transaction == transaction)
 		goto done;

+	/*
+	 * We only ever set this variable to 1 so the test is safe. Since
+	 * t_need_data_flush is likely to be set, we do the test to save some
+	 * cacheline bouncing
+	 */
+	if (!transaction->t_need_data_flush)
+		transaction->t_need_data_flush = 1;
 	/* On some different transaction's list - should be
 	 * the committing one */
 	if (jinode->i_transaction) {
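The test-before-set idiom in that hunk is worth calling out: on a hot path, unconditionally storing 1 would pull the cacheline into exclusive state on every CPU that files an inode, whereas a read of an already-set flag leaves the line shared. Generic form (illustrative, not jbd2 code):

	/* Read-only fast path: the cacheline stays in shared state. */
	if (!shared->flag)
		/* The store happens only on the first transition. */
		shared->flag = 1;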
...
@@ -529,9 +529,10 @@ struct transaction_s
 	enum {
 		T_RUNNING,
 		T_LOCKED,
-		T_RUNDOWN,
 		T_FLUSH,
 		T_COMMIT,
+		T_COMMIT_DFLUSH,
+		T_COMMIT_JFLUSH,
 		T_FINISHED
 	} t_state;
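With the two new states, the commit-side lifecycle reads in order, which is what lets jbd2_trans_will_send_data_barrier() use >= comparisons on t_state:

/*
 * T_RUNNING -> T_LOCKED -> T_FLUSH -> T_COMMIT
 *   -> T_COMMIT_DFLUSH   (data flushed to the fs device)
 *   -> T_COMMIT_JFLUSH   (commit block written to the journal)
 *   -> T_FINISHED
 */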
@@ -658,7 +659,9 @@ struct transaction_s
 	 * waiting for it to finish.
 	 */
 	unsigned int t_synchronous_commit:1;
-	unsigned int t_flushed_data_blocks:1;
+
+	/* Disk flush needs to be sent to fs partition [no locking] */
+	int t_need_data_flush;

 	/*
 	 * For use by the filesystem to store fs-specific data
int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_journal_force_commit_nested(journal_t *journal);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
void __jbd2_log_wait_for_space(journal_t *journal); void __jbd2_log_wait_for_space(journal_t *journal);
extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
......