Commit 8016e29f authored by Harshad Shirwadkar's avatar Harshad Shirwadkar Committed by Theodore Ts'o

ext4: fast commit recovery path

This patch adds fast commit recovery path support for Ext4 file
system. We add several helper functions that are similar in spirit to
e2fsprogs journal recovery path handlers. Example of such functions
include - a simple block allocator, idempotent block bitmap update
function etc. Using these routines and the fast commit log in the fast
commit area, the recovery path (ext4_fc_replay()) performs fast commit
log recovery.
Reported-by: default avatarkernel test robot <lkp@intel.com>
Signed-off-by: default avatarHarshad Shirwadkar <harshadshirwadkar@gmail.com>
Link: https://lore.kernel.org/r/20201015203802.3597742-8-harshadshirwadkar@gmail.comSigned-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 5b849b5f
......@@ -368,7 +368,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_group_info *grp;
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
......
......@@ -1170,6 +1170,7 @@ struct ext4_inode_info {
#define EXT4_FC_COMMITTING 0x0010 /* File system underoing a fast
* commit.
*/
#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */
/*
* Misc. filesystem flags
......@@ -1666,6 +1667,10 @@ struct ext4_sb_info {
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
u64 s_fc_avg_commit_time;
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
#endif
struct ext4_fc_replay_state s_fc_replay_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
......@@ -2708,6 +2713,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo);
/* ialloc.c */
extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
uid_t *owner, __u32 i_flags,
......@@ -2749,6 +2755,8 @@ void ext4_fc_stop_ineligible(struct super_block *sb);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
......@@ -2781,8 +2789,12 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
int len, int state);
/* inode.c */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
......@@ -2829,6 +2841,8 @@ extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
......@@ -2862,12 +2876,15 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
extern void ext4_reset_inode_seed(struct inode *inode);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
extern int ext4_orphan_add(handle_t *, struct inode *);
......@@ -3447,6 +3464,10 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
extern int ext4_ci_compare(const struct inode *parent,
const struct qstr *fname,
const struct qstr *entry, bool quick);
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
struct inode *inode);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
......@@ -3547,6 +3568,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
int check_cred, int restart_cred,
int revoke_cred);
extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
extern int ext4_ext_replay_set_iblocks(struct inode *inode);
extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
int len, int unwritten, ext4_fsblk_t pblk);
extern int ext4_ext_clear_bb(struct inode *inode);
/* move_extent.c */
......
......@@ -100,7 +100,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
return ERR_PTR(err);
journal = EXT4_SB(sb)->s_journal;
if (!journal)
if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return ext4_get_nojournal();
return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
GFP_NOFS, type, line);
......
......@@ -5804,3 +5804,264 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
return err ? err : mapped;
}
/*
* Updates physical block address and unwritten status of extent
* starting at lblk start and of len. If such an extent doesn't exist,
* this function splits the extent tree appropriately to create an
* extent like this. This function is called in the fast commit
* replay path. Returns 0 on success and error on failure.
*/
int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
int len, int unwritten, ext4_fsblk_t pblk)
{
struct ext4_ext_path *path = NULL, *ppath;
struct ext4_extent *ex;
int ret;
path = ext4_find_extent(inode, start, NULL, 0);
if (!path)
return -EINVAL;
ex = path[path->p_depth].p_ext;
if (!ex) {
ret = -EFSCORRUPTED;
goto out;
}
if (le32_to_cpu(ex->ee_block) != start ||
ext4_ext_get_actual_len(ex) != len) {
/* We need to split this extent to match our extent first */
ppath = path;
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
up_write(&EXT4_I(inode)->i_data_sem);
if (ret)
goto out;
kfree(path);
path = ext4_find_extent(inode, start, NULL, 0);
if (IS_ERR(path))
return -1;
ppath = path;
ex = path[path->p_depth].p_ext;
WARN_ON(le32_to_cpu(ex->ee_block) != start);
if (ext4_ext_get_actual_len(ex) != len) {
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_force_split_extent_at(NULL, inode, &ppath,
start + len, 1);
up_write(&EXT4_I(inode)->i_data_sem);
if (ret)
goto out;
kfree(path);
path = ext4_find_extent(inode, start, NULL, 0);
if (IS_ERR(path))
return -EINVAL;
ex = path[path->p_depth].p_ext;
}
}
if (unwritten)
ext4_ext_mark_unwritten(ex);
else
ext4_ext_mark_initialized(ex);
ext4_ext_store_pblock(ex, pblk);
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
out:
ext4_ext_drop_refs(path);
kfree(path);
ext4_mark_inode_dirty(NULL, inode);
return ret;
}
/* Try to shrink the extent tree */
void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent *ex;
ext4_lblk_t old_cur, cur = 0;
while (cur < end) {
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
return;
ex = path[path->p_depth].p_ext;
if (!ex) {
ext4_ext_drop_refs(path);
kfree(path);
ext4_mark_inode_dirty(NULL, inode);
return;
}
old_cur = cur;
cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
if (cur <= old_cur)
cur = old_cur + 1;
ext4_ext_try_to_merge(NULL, inode, path, ex);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
ext4_mark_inode_dirty(NULL, inode);
ext4_ext_drop_refs(path);
kfree(path);
}
}
/* Check if *cur is a hole and if it is, skip it */
static void skip_hole(struct inode *inode, ext4_lblk_t *cur)
{
int ret;
struct ext4_map_blocks map;
map.m_lblk = *cur;
map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret != 0)
return;
*cur = *cur + map.m_len;
}
/* Count number of blocks used by this inode and update i_blocks */
int ext4_ext_replay_set_iblocks(struct inode *inode)
{
struct ext4_ext_path *path = NULL, *path2 = NULL;
struct ext4_extent *ex;
ext4_lblk_t cur = 0, end;
int numblks = 0, i, ret = 0;
ext4_fsblk_t cmp1, cmp2;
struct ext4_map_blocks map;
/* Determin the size of the file first */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
ext4_ext_drop_refs(path);
kfree(path);
goto out;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
ext4_ext_drop_refs(path);
kfree(path);
/* Count the number of data blocks */
cur = 0;
while (cur < end) {
map.m_lblk = cur;
map.m_len = end - cur;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
break;
if (ret > 0)
numblks += ret;
cur = cur + map.m_len;
}
/*
* Count the number of extent tree blocks. We do it by looking up
* two successive extents and determining the difference between
* their paths. When path is different for 2 successive extents
* we compare the blocks in the path at each level and increment
* iblocks by total number of differences found.
*/
cur = 0;
skip_hole(inode, &cur);
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
goto out;
numblks += path->p_depth;
ext4_ext_drop_refs(path);
kfree(path);
while (cur < end) {
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
break;
ex = path[path->p_depth].p_ext;
if (!ex) {
ext4_ext_drop_refs(path);
kfree(path);
return 0;
}
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
skip_hole(inode, &cur);
path2 = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path2)) {
ext4_ext_drop_refs(path);
kfree(path);
break;
}
ex = path2[path2->p_depth].p_ext;
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
cmp1 = cmp2 = 0;
if (i <= path->p_depth)
cmp1 = path[i].p_bh ?
path[i].p_bh->b_blocknr : 0;
if (i <= path2->p_depth)
cmp2 = path2[i].p_bh ?
path2[i].p_bh->b_blocknr : 0;
if (cmp1 != cmp2 && cmp2 != 0)
numblks++;
}
ext4_ext_drop_refs(path);
ext4_ext_drop_refs(path2);
kfree(path);
kfree(path2);
}
out:
inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
ext4_mark_inode_dirty(NULL, inode);
return 0;
}
int ext4_ext_clear_bb(struct inode *inode)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent *ex;
ext4_lblk_t cur = 0, end;
int j, ret = 0;
struct ext4_map_blocks map;
/* Determin the size of the file first */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
ext4_ext_drop_refs(path);
kfree(path);
return 0;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
ext4_ext_drop_refs(path);
kfree(path);
cur = 0;
while (cur < end) {
map.m_lblk = cur;
map.m_len = end - cur;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
break;
if (ret > 0) {
path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
if (!IS_ERR_OR_NULL(path)) {
for (j = 0; j < path->p_depth; j++) {
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, 0);
}
ext4_ext_drop_refs(path);
kfree(path);
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
}
cur = cur + map.m_len;
}
return 0;
}
......@@ -311,6 +311,9 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
trace_ext4_es_find_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
......@@ -361,6 +364,9 @@ bool ext4_es_scan_range(struct inode *inode,
{
bool ret;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return false;
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
......@@ -404,6 +410,9 @@ bool ext4_es_scan_clu(struct inode *inode,
{
bool ret;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return false;
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_clu(inode, matching_fn, lblk);
read_unlock(&EXT4_I(inode)->i_es_lock);
......@@ -812,6 +821,9 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
int err = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
......@@ -873,6 +885,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
......@@ -908,6 +923,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node;
int found = 0;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
trace_ext4_es_lookup_extent_enter(inode, lblk);
es_debug("lookup extent in block %u\n", lblk);
......@@ -1419,6 +1437,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
int err = 0;
int reserved = 0;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
......@@ -1969,6 +1990,9 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
int err = 0;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
......
This diff is collapsed.
......@@ -116,4 +116,44 @@ struct ext4_fc_stats {
unsigned long fc_numblks;
};
#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4
/*
* Physical block regions added to different inodes due to fast commit
* recovery. These are set during the SCAN phase. During the replay phase,
* our allocator excludes these from its allocation. This ensures that
* we don't accidentally allocating a block that is going to be used by
* another inode.
*/
struct ext4_fc_alloc_region {
ext4_lblk_t lblk;
ext4_fsblk_t pblk;
int ino, len;
};
/*
* Fast commit replay state.
*/
struct ext4_fc_replay_state {
int fc_replay_num_tags;
int fc_replay_expected_off;
int fc_current_pass;
int fc_cur_tag;
int fc_crc;
struct ext4_fc_alloc_region *fc_regions;
int fc_regions_size, fc_regions_used, fc_regions_valid;
int *fc_modified_inodes;
int fc_modified_inodes_used, fc_modified_inodes_size;
};
#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
#define fc_for_each_tl(__start, __end, __tl) \
for (tl = (struct ext4_fc_tl *)start; \
(u8 *)tl < (u8 *)end; \
tl = (struct ext4_fc_tl *)((u8 *)tl + \
sizeof(struct ext4_fc_tl) + \
+ le16_to_cpu(tl->fc_len)))
#endif /* __FAST_COMMIT_H__ */
......@@ -82,7 +82,12 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_group_info *grp;
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
......@@ -281,15 +286,17 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
grp = ext4_get_group_info(sb, block_group);
if (IS_ERR(bitmap_bh)) {
fatal = PTR_ERR(bitmap_bh);
bitmap_bh = NULL;
goto error_return;
}
if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
fatal = -EFSCORRUPTED;
goto error_return;
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, block_group);
if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
fatal = -EFSCORRUPTED;
goto error_return;
}
}
BUFFER_TRACE(bitmap_bh, "get_write_access");
......@@ -739,6 +746,122 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group,
return 1;
}
int ext4_mark_inode_used(struct super_block *sb, int ino)
{
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
struct ext4_group_desc *gdp;
ext4_group_t group;
int bit;
int err = -EFSCORRUPTED;
if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
goto out;
group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (IS_ERR(inode_bitmap_bh))
return PTR_ERR(inode_bitmap_bh);
if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
err = 0;
goto out;
}
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp || !group_desc_bh) {
err = -EINVAL;
goto out;
}
ext4_set_bit(bit, inode_bitmap_bh->b_data);
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
if (err) {
ext4_std_error(sb, err);
goto out;
}
err = sync_dirty_buffer(inode_bitmap_bh);
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(block_bitmap_bh)) {
err = PTR_ERR(block_bitmap_bh);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
sync_dirty_buffer(block_bitmap_bh);
/* recheck and clear flag under lock if we still need to */
ext4_lock_group(sb, group);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb, group, gdp));
ext4_block_bitmap_csum_set(sb, group, gdp,
block_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
brelse(block_bitmap_bh);
if (err) {
ext4_std_error(sb, err);
goto out;
}
}
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
free = 0;
}
/*
* Check the relative inode number against the last used
* relative inode number in this group. if it is greater
* we need to update the bg_itable_unused count
*/
if (bit >= free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - bit - 1));
} else {
ext4_lock_group(sb, group);
}
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (ext4_has_group_desc_csum(sb)) {
ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
sync_dirty_buffer(group_desc_bh);
out:
return err;
}
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
......@@ -768,7 +891,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
struct ext4_group_info *grp = NULL;
int encrypt = 0;
/* Cannot create files in a deleted directory */
......@@ -906,15 +1029,21 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
grp = ext4_get_group_info(sb, group);
/* Skip groups with already-known suspicious inode tables */
if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, group);
/*
* Skip groups with already-known suspicious inode
* tables
*/
if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
}
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
&& EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
......@@ -933,7 +1062,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
goto next_group;
}
if (!handle) {
if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
handle_type, nblocks, 0,
......@@ -1037,9 +1166,15 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
struct ext4_group_info *grp = NULL;
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, group);
down_read(&grp->alloc_sem); /*
* protect vs itable
* lazyinit
*/
}
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
......@@ -1055,7 +1190,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (ino > free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
up_read(&grp->alloc_sem);
if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
up_read(&grp->alloc_sem);
} else {
ext4_lock_group(sb, group);
}
......
......@@ -101,8 +101,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
return provided == calculated;
}
static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
__u32 csum;
......@@ -514,7 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
......@@ -827,7 +828,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
int err;
J_ASSERT(handle != NULL || create == 0);
J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| handle != NULL || create == 0);
map.m_lblk = block;
map.m_len = 1;
......@@ -843,7 +845,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
J_ASSERT(create != 0);
J_ASSERT(handle != NULL);
J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| (handle != NULL));
/*
* Now that we do not always journal data, we should
......@@ -4284,22 +4287,22 @@ int ext4_truncate(struct inode *inode)
* data in memory that is needed to recreate the on-disk version of this
* inode.
*/
static int __ext4_get_inode_loc(struct inode *inode,
struct ext4_iloc *iloc, int in_mem)
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc, int in_mem,
ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
struct buffer_head *bh;
struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
if (inode->i_ino < EXT4_ROOT_INO ||
inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
if (ino < EXT4_ROOT_INO ||
ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
return -EFSCORRUPTED;
iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
if (!gdp)
return -EIO;
......@@ -4308,7 +4311,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
......@@ -4400,14 +4403,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
trace_ext4_load_inode(inode);
trace_ext4_load_inode(sb, ino);
ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
simulate_eio:
ext4_error_inode_block(inode, block, EIO,
"unable to read itable block");
if (ret_block)
*ret_block = block;
brelse(bh);
return -EIO;
}
......@@ -4417,11 +4420,43 @@ static int __ext4_get_inode_loc(struct inode *inode,
return 0;
}
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
struct ext4_iloc *iloc)
{
ext4_fsblk_t err_blk;
int ret;
ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
&err_blk);
if (ret == -EIO)
ext4_error_inode_block(inode, err_blk, EIO,
"unable to read itable block");
return ret;
}
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
ext4_fsblk_t err_blk;
int ret;
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
!ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
if (ret == -EIO)
ext4_error_inode_block(inode, err_blk, EIO,
"unable to read itable block");
return ret;
}
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc)
{
return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
}
static bool ext4_should_enable_dax(struct inode *inode)
......@@ -4587,7 +4622,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei = EXT4_I(inode);
iloc.bh = NULL;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
......@@ -4633,10 +4668,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(gen));
}
if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
ext4_error_inode_err(inode, function, line, 0, EFSBADCRC,
"iget: checksum invalid");
if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
(!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
ext4_error_inode_err(inode, function, line, 0,
EFSBADCRC, "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
......@@ -4790,9 +4826,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
/* validate the block references in the inode */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
(S_ISLNK(inode->i_mode) &&
!ext4_inode_is_fast_symlink(inode))) {
if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
(S_ISLNK(inode->i_mode) &&
!ext4_inode_is_fast_symlink(inode)))) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_check_inode(inode);
else
......@@ -5176,7 +5213,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
} else {
struct ext4_iloc iloc;
err = __ext4_get_inode_loc(inode, &iloc, 0);
err = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (err)
return err;
/*
......
......@@ -86,7 +86,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
i_size_write(inode2, isize);
}
static void reset_inode_seed(struct inode *inode)
void ext4_reset_inode_seed(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
......@@ -200,8 +200,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_generation = prandom_u32();
inode_bl->i_generation = prandom_u32();
reset_inode_seed(inode);
reset_inode_seed(inode_bl);
ext4_reset_inode_seed(inode);
ext4_reset_inode_seed(inode_bl);
ext4_discard_preallocations(inode, 0);
......
......@@ -1502,14 +1502,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing already freed block "
"(bit %u); block bitmap corrupt.",
block);
ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing already freed block (bit %u); block bitmap corrupt.",
block);
ext4_mark_group_bitmap_corrupted(
sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_regenerate_buddy(e4b);
goto done;
}
......@@ -3296,6 +3298,84 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
return err;
}
/*
* Idempotent helper for Ext4 fast commit replay path to set the state of
* blocks in bitmaps and update counters.
*/
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
int len, int state)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
int i, clen, err;
int already;
clen = EXT4_B2C(sbi, len);
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
bitmap_bh = NULL;
goto out_err;
}
err = -EIO;
gdp = ext4_get_group_desc(sb, group, &gdp_bh);
if (!gdp)
goto out_err;
ext4_lock_group(sb, group);
already = 0;
for (i = 0; i < clen; i++)
if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
already++;
if (state)
ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
else
mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb,
group, gdp));
}
if (state)
clen = ext4_free_group_clusters(sb, gdp) - clen + already;
else
clen = ext4_free_group_clusters(sb, gdp) + clen - already;
ext4_free_group_clusters_set(sb, gdp, clen);
ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
ext4_unlock_group(sb, group);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, group);
atomic64_sub(len,
&sbi_array_rcu_deref(sbi, s_flex_groups,
flex_group)->free_clusters);
}
err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
goto out_err;
sync_dirty_buffer(bitmap_bh);
err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
sync_dirty_buffer(gdp_bh);
out_err:
brelse(bitmap_bh);
}
/*
* here we normalize request for locality group
* Group request are normalized to s_mb_group_prealloc, which goes to
......@@ -4272,6 +4352,9 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
return;
}
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return;
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
trace_ext4_discard_preallocations(inode,
......@@ -4819,6 +4902,9 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
return ret;
}
static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
struct ext4_allocation_request *ar, int *errp);
/*
* Main entry point into mballoc to allocate blocks
* it tries to use preallocation first, then falls back
......@@ -4840,6 +4926,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
if (sbi->s_mount_state & EXT4_FC_REPLAY)
return ext4_mb_new_blocks_simple(handle, ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
......@@ -5067,6 +5155,102 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
return 0;
}
/*
* Simple allocator for Ext4 fast commit replay path. It searches for blocks
* linearly starting at the goal block and also excludes the blocks which
* are going to be in use after fast commit replay.
*/
static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
struct buffer_head *bitmap_bh;
struct super_block *sb = ar->inode->i_sb;
ext4_group_t group;
ext4_grpblk_t blkoff;
int i;
ext4_fsblk_t goal, block;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
goal = ar->goal;
if (goal < le32_to_cpu(es->s_first_data_block) ||
goal >= ext4_blocks_count(es))
goal = le32_to_cpu(es->s_first_data_block);
ar->len = 0;
ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
for (; group < ext4_get_groups_count(sb); group++) {
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
*errp = PTR_ERR(bitmap_bh);
pr_warn("Failed to read block bitmap\n");
return 0;
}
ext4_get_group_no_and_offset(sb,
max(ext4_group_first_block_no(sb, group), goal),
NULL, &blkoff);
i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize,
blkoff);
brelse(bitmap_bh);
if (i >= sb->s_blocksize)
continue;
if (ext4_fc_replay_check_excluded(sb,
ext4_group_first_block_no(sb, group) + i))
continue;
break;
}
if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize)
return 0;
block = ext4_group_first_block_no(sb, group) + i;
ext4_mb_mark_bb(sb, block, 1, 1);
ar->len = 1;
return block;
}
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
unsigned long count)
{
struct buffer_head *bitmap_bh;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
struct buffer_head *gdp_bh;
ext4_group_t group;
ext4_grpblk_t blkoff;
int already_freed = 0, err, i;
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
pr_warn("Failed to read block bitmap\n");
return;
}
gdp = ext4_get_group_desc(sb, group, &gdp_bh);
if (!gdp)
return;
for (i = 0; i < count; i++) {
if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
already_freed++;
}
mb_clear_bits(bitmap_bh->b_data, blkoff, count);
err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
return;
ext4_free_group_clusters_set(
sb, gdp, ext4_free_group_clusters(sb, gdp) +
count - already_freed);
ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
sync_dirty_buffer(bitmap_bh);
sync_dirty_buffer(gdp_bh);
brelse(bitmap_bh);
}
/**
* ext4_free_blocks() -- Free given blocks and update quota
* @handle: handle for this transaction
......@@ -5093,6 +5277,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
sbi = EXT4_SB(sb);
if (sbi->s_mount_state & EXT4_FC_REPLAY) {
ext4_free_blocks_simple(inode, block, count);
return;
}
might_sleep();
if (bh) {
if (block)
......@@ -5101,7 +5292,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
block = bh->b_blocknr;
}
sbi = EXT4_SB(sb);
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
......
......@@ -2749,7 +2749,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
return ext4_next_entry(de, blocksize);
}
static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
......@@ -3197,42 +3197,32 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
struct inode *inode)
{
int retval;
struct inode *inode;
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
int skip_remove_dentry = 0;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
retval = dquot_initialize(dir);
if (retval)
goto out_trace;
retval = dquot_initialize(d_inode(dentry));
if (retval)
goto out_trace;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh)) {
retval = PTR_ERR(bh);
goto out_trace;
}
if (!bh) {
retval = -ENOENT;
goto out_trace;
}
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
inode = d_inode(dentry);
if (!bh)
return -ENOENT;
if (le32_to_cpu(de->inode) != inode->i_ino) {
retval = -EFSCORRUPTED;
goto out_bh;
/*
* It's okay if we find dont find dentry which matches
* the inode. That's because it might have gotten
* renamed to a different inode number
*/
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
goto out_bh;
}
handle = ext4_journal_start(dir, EXT4_HT_DIR,
......@@ -3245,17 +3235,21 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
goto out_handle;
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
goto out_handle;
} else {
retval = 0;
}
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
dentry->d_name.len, dentry->d_name.name);
d_name->len, d_name->name);
else
drop_nlink(inode);
if (!inode->i_nlink)
......@@ -3263,6 +3257,33 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
out_handle:
ext4_journal_stop(handle);
out_bh:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
trace_ext4_unlink_enter(dir, dentry);
/*
* Initialize quotas before so that eventual writes go
* in separate transaction
*/
retval = dquot_initialize(dir);
if (retval)
goto out_trace;
retval = dquot_initialize(d_inode(dentry));
if (retval)
goto out_trace;
retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry));
if (!retval)
ext4_fc_track_unlink(d_inode(dentry), dentry);
#ifdef CONFIG_UNICODE
......@@ -3276,10 +3297,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
d_invalidate(dentry);
#endif
out_handle:
ext4_journal_stop(handle);
out_bh:
brelse(bh);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
......@@ -3360,7 +3377,8 @@ static int ext4_symlink(struct inode *dir,
*/
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
if (handle)
ext4_journal_stop(handle);
handle = NULL;
if (err)
goto err_drop_inode;
......@@ -3414,29 +3432,10 @@ static int ext4_symlink(struct inode *dir,
return err;
}
static int ext4_link(struct dentry *old_dentry,
struct inode *dir, struct dentry *dentry)
int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
{
handle_t *handle;
struct inode *inode = d_inode(old_dentry);
int err, retries = 0;
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
err = fscrypt_prepare_link(old_dentry, dir, dentry);
if (err)
return err;
if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
(!projid_eq(EXT4_I(dir)->i_projid,
EXT4_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
err = dquot_initialize(dir);
if (err)
return err;
retry:
handle = ext4_journal_start(dir, EXT4_HT_DIR,
(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
......@@ -3453,6 +3452,7 @@ static int ext4_link(struct dentry *old_dentry,
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_fc_track_link(inode, dentry);
err = ext4_mark_inode_dirty(handle, inode);
/* this can happen only for tmpfile being
* linked the first time
......@@ -3470,6 +3470,29 @@ static int ext4_link(struct dentry *old_dentry,
return err;
}
static int ext4_link(struct dentry *old_dentry,
struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
int err;
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
err = fscrypt_prepare_link(old_dentry, dir, dentry);
if (err)
return err;
if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
(!projid_eq(EXT4_I(dir)->i_projid,
EXT4_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
err = dquot_initialize(dir);
if (err)
return err;
return __ext4_link(dir, inode, dentry);
}
/*
* Try to find buffer head where contains the parent block.
......
......@@ -1718,6 +1718,9 @@ enum {
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
Opt_prefetch_block_bitmaps, Opt_no_fc,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay
#endif
};
static const match_table_t tokens = {
......@@ -1805,6 +1808,9 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_no_fc, "no_fc"},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
......@@ -2034,6 +2040,9 @@ static const struct mount_opts {
MOPT_SET},
{Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
{Opt_err, 0, 0}
};
......@@ -2242,6 +2251,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sbi->s_li_wait_mult = arg;
} else if (token == Opt_max_dir_size_kb) {
sbi->s_max_dir_size_kb = arg;
#ifdef CONFIG_EXT4_DEBUG
} else if (token == Opt_fc_debug_max_replay) {
sbi->s_fc_debug_max_replay = arg;
#endif
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (token == Opt_resuid) {
......@@ -4764,6 +4777,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
spin_lock_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
sbi->s_fc_replay_state.fc_regions_used = 0;
sbi->s_fc_replay_state.fc_regions_valid = 0;
sbi->s_fc_replay_state.fc_modified_inodes = NULL;
sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
sb->s_root = NULL;
......@@ -4979,6 +4999,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount4a;
}
}
ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
err = ext4_mb_init(sb);
......
......@@ -1776,9 +1776,9 @@ TRACE_EVENT(ext4_ext_load_extent,
);
TRACE_EVENT(ext4_load_inode,
TP_PROTO(struct inode *inode),
TP_PROTO(struct super_block *sb, unsigned long ino),
TP_ARGS(inode),
TP_ARGS(sb, ino),
TP_STRUCT__entry(
__field( dev_t, dev )
......@@ -1786,8 +1786,8 @@ TRACE_EVENT(ext4_load_inode,
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->dev = sb->s_dev;
__entry->ino = ino;
),
TP_printk("dev %d,%d ino %ld",
......@@ -2801,6 +2801,54 @@ TRACE_EVENT(ext4_lazy_itable_init,
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
);
TRACE_EVENT(ext4_fc_replay_scan,
TP_PROTO(struct super_block *sb, int error, int off),
TP_ARGS(sb, error, off),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, error)
__field(int, off)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
__entry->error = error;
__entry->off = off;
),
TP_printk("FC scan pass on dev %d,%d: error %d, off %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->error, __entry->off)
);
TRACE_EVENT(ext4_fc_replay,
TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2),
TP_ARGS(sb, tag, ino, priv1, priv2),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, tag)
__field(int, ino)
__field(int, priv1)
__field(int, priv2)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
__entry->tag = tag;
__entry->ino = ino;
__entry->priv1 = priv1;
__entry->priv2 = priv2;
),
TP_printk("FC Replay %d,%d: tag %d, ino %d, data1 %d, data2 %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->tag, __entry->ino, __entry->priv1, __entry->priv2)
);
TRACE_EVENT(ext4_fc_commit_start,
TP_PROTO(struct super_block *sb),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment