Commit 5404525b authored by Linus Torvalds

Merge tag 'for-4.19-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - fix for improper fsync after hardlink

 - fix for a corruption during file deduplication

 - use after free fixes

 - RCU warning fix

 - fix for buffered write to nodatacow file

* tag 'for-4.19-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: Fix suspicious RCU usage warning in btrfs_debug_in_rcu
  btrfs: use after free in btrfs_quota_enable
  btrfs: btrfs_shrink_device should call commit transaction at the end
  btrfs: fix qgroup_free wrong num_bytes in btrfs_subvolume_reserve_metadata
  Btrfs: fix data corruption when deduplicating between different files
  Btrfs: sync log after logging new name
  Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
parents b36fdc68 b6fdfbff
......@@ -1280,6 +1280,7 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
spinlock_t qgroup_meta_rsv_lock;
......@@ -3390,9 +3391,9 @@ do { \
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#endif
......@@ -3404,6 +3405,13 @@ do { \
rcu_read_unlock(); \
} while (0)
/*
 * Same as btrfs_no_printk(), but the call is bracketed by
 * rcu_read_lock()/rcu_read_unlock(). Wrapped in do { } while (0) so the
 * expansion behaves as a single statement.
 */
#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
do { \
rcu_read_lock(); \
btrfs_no_printk(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
......
......@@ -1187,6 +1187,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
......
......@@ -5800,7 +5800,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need to reserve space for
* qgroup_reserved: used to return the reserved size in qgroup
* use_global_rsv: allow fallback to the global block reservation
*
* This function is used to reserve the space for snapshot/subvolume
* creation and deletion. Those operations are different with the
......@@ -5810,10 +5810,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
* the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int items,
struct btrfs_block_rsv *rsv, int items,
bool use_global_rsv)
{
u64 qgroup_num_bytes = 0;
u64 num_bytes;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
......@@ -5821,12 +5821,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
qgroup_num_bytes, true);
if (ret)
return ret;
} else {
num_bytes = 0;
}
num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
......@@ -5838,8 +5837,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && num_bytes)
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
return ret;
}
......
......@@ -1271,7 +1271,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_num_bytes;
u64 ram_bytes;
int extent_type;
int ret, err;
int ret;
int type;
int nocow;
int check_prev = 1;
......@@ -1403,11 +1403,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
* if there are pending snapshots for this root,
* we fall into common COW way.
*/
if (!nolock) {
err = btrfs_start_write_no_snapshotting(root);
if (!err)
goto out_check;
}
if (!nolock && atomic_read(&root->snapshot_force_cow))
goto out_check;
/*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
......@@ -1416,9 +1413,6 @@ static noinline int run_delalloc_nocow(struct inode *inode,
ret = csum_exist_in_range(fs_info, disk_bytenr,
num_bytes);
if (ret) {
if (!nolock)
btrfs_end_write_no_snapshotting(root);
/*
* ret could be -EIO if the above fails to read
* metadata.
......@@ -1431,11 +1425,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
WARN_ON_ONCE(nolock);
goto out_check;
}
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
if (!nolock)
btrfs_end_write_no_snapshotting(root);
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
}
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
......@@ -1448,8 +1439,6 @@ static noinline int run_delalloc_nocow(struct inode *inode,
out_check:
if (extent_end <= start) {
path->slots[0]++;
if (!nolock && nocow)
btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
goto next_slot;
......@@ -1471,8 +1460,6 @@ static noinline int run_delalloc_nocow(struct inode *inode,
end, page_started, nr_written, 1,
NULL);
if (ret) {
if (!nolock && nocow)
btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
......@@ -1492,8 +1479,6 @@ static noinline int run_delalloc_nocow(struct inode *inode,
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
if (!nolock && nocow)
btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
......@@ -1532,8 +1517,6 @@ static noinline int run_delalloc_nocow(struct inode *inode,
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_PRIVATE2);
if (!nolock && nocow)
btrfs_end_write_no_snapshotting(root);
cur_offset = extent_end;
/*
......@@ -6639,6 +6622,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
......@@ -6652,7 +6637,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
true, NULL);
if (ret == BTRFS_NEED_TRANS_COMMIT) {
err = btrfs_commit_transaction(trans);
trans = NULL;
}
}
fail:
......@@ -9388,14 +9378,21 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_idx = 0;
u64 root_objectid;
int ret;
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
struct btrfs_log_ctx ctx_root;
struct btrfs_log_ctx ctx_dest;
bool sync_log_root = false;
bool sync_log_dest = false;
bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
btrfs_init_log_ctx(&ctx_root, old_inode);
btrfs_init_log_ctx(&ctx_dest, new_inode);
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
......@@ -9542,15 +9539,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (root_log_pinned) {
parent = new_dentry->d_parent;
btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
parent);
ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
BTRFS_I(old_dir), parent,
false, &ctx_root);
if (ret == BTRFS_NEED_LOG_SYNC)
sync_log_root = true;
else if (ret == BTRFS_NEED_TRANS_COMMIT)
commit_transaction = true;
ret = 0;
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
parent = old_dentry->d_parent;
btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
parent);
if (!commit_transaction) {
parent = old_dentry->d_parent;
ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
BTRFS_I(new_dir), parent,
false, &ctx_dest);
if (ret == BTRFS_NEED_LOG_SYNC)
sync_log_dest = true;
else if (ret == BTRFS_NEED_TRANS_COMMIT)
commit_transaction = true;
ret = 0;
}
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
......@@ -9583,8 +9594,26 @@ static int btrfs_rename_exchange(struct inode *old_dir,
dest_log_pinned = false;
}
}
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
if (!ret && sync_log_root && !commit_transaction) {
ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
&ctx_root);
if (ret)
commit_transaction = true;
}
if (!ret && sync_log_dest && !commit_transaction) {
ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
&ctx_dest);
if (ret)
commit_transaction = true;
}
if (commit_transaction) {
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
}
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
......@@ -9661,6 +9690,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int ret;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
struct btrfs_log_ctx ctx;
bool sync_log = false;
bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
......@@ -9818,8 +9850,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (log_pinned) {
struct dentry *parent = new_dentry->d_parent;
btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
parent);
btrfs_init_log_ctx(&ctx, old_inode);
ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
BTRFS_I(old_dir), parent,
false, &ctx);
if (ret == BTRFS_NEED_LOG_SYNC)
sync_log = true;
else if (ret == BTRFS_NEED_TRANS_COMMIT)
commit_transaction = true;
ret = 0;
btrfs_end_log_trans(root);
log_pinned = false;
}
......@@ -9856,7 +9895,19 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_end_log_trans(root);
log_pinned = false;
}
btrfs_end_transaction(trans);
if (!ret && sync_log) {
ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
if (ret)
commit_transaction = true;
}
if (commit_transaction) {
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
}
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
......
......@@ -747,6 +747,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
bool snapshot_force_cow = false;
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
......@@ -763,6 +764,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
/*
* Force new buffered writes to reserve space even when NOCOW is
* possible. This is to avoid later writeback (running delalloc)
* falling back to COW mode and unexpectedly failing with ENOSPC.
*/
atomic_inc(&root->will_be_snapshotted);
smp_mb__after_atomic();
/* wait for no snapshot writes */
......@@ -773,6 +779,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
/*
* All previous writes have started writeback in NOCOW mode, so now
* we force future writes to fall back to COW mode during snapshot
* creation.
*/
atomic_inc(&root->snapshot_force_cow);
snapshot_force_cow = true;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
......@@ -837,6 +851,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
fail:
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
if (snapshot_force_cow)
atomic_dec(&root->snapshot_force_cow);
if (atomic_dec_and_test(&root->will_be_snapshotted))
wake_up_var(&root->will_be_snapshotted);
free_pending:
......@@ -3453,6 +3469,25 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
same_lock_start = min_t(u64, loff, dst_loff);
same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
} else {
/*
* If the source and destination inodes are different, the
* source's range end offset matches the source's i_size, that
* i_size is not a multiple of the sector size, and the
* destination range does not go past the destination's i_size,
* we must round down the length to the nearest sector size
* multiple. If we don't do this adjustment we end up replacing
* with zeroes the bytes in the range that starts at the
* deduplication range's end offset and ends at the next sector
* size multiple.
*/
if (loff + olen == i_size_read(src) &&
dst_loff + len < i_size_read(dst)) {
const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
len = round_down(i_size_read(src), sz) - loff;
olen = len;
}
}
again:
......
......@@ -1019,10 +1019,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->qgroup_lock);
ret = btrfs_commit_transaction(trans);
if (ret) {
trans = NULL;
trans = NULL;
if (ret)
goto out_free_path;
}
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
......
......@@ -6025,14 +6025,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
*
* It will return zero if all goes well, and it will return 1 if a
* full transaction commit is required.
* @ctx can not be NULL when @sync_log is false, and should be NULL when it's
* true (because it's not used).
*
* Return value depends on whether @sync_log is true or false.
* When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
* committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
* otherwise.
* When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need
* to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
* or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
* committed (without attempting to sync the log).
*/
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
struct dentry *parent)
struct dentry *parent,
bool sync_log, struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
/*
* this will force the logging code to walk the dentry chain
......@@ -6047,9 +6058,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
return 0;
return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
BTRFS_DONT_NEED_LOG_SYNC;
if (sync_log) {
struct btrfs_log_ctx ctx2;
btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
LOG_INODE_EXISTS, &ctx2);
if (ret == BTRFS_NO_LOG_SYNC)
return BTRFS_DONT_NEED_TRANS_COMMIT;
else if (ret)
return BTRFS_NEED_TRANS_COMMIT;
ret = btrfs_sync_log(trans, inode->root, &ctx2);
if (ret)
return BTRFS_NEED_TRANS_COMMIT;
return BTRFS_DONT_NEED_TRANS_COMMIT;
}
ASSERT(ctx);
ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
LOG_INODE_EXISTS, ctx);
if (ret == BTRFS_NO_LOG_SYNC)
return BTRFS_DONT_NEED_LOG_SYNC;
else if (ret)
return BTRFS_NEED_TRANS_COMMIT;
return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
LOG_INODE_EXISTS, NULL);
return BTRFS_NEED_LOG_SYNC;
}
......@@ -71,8 +71,16 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
/* Return values for btrfs_log_new_name() */
enum {
/* sync_log == true: the caller does not need to commit the transaction. */
BTRFS_DONT_NEED_TRANS_COMMIT,
/* Either mode: the caller must commit the transaction. */
BTRFS_NEED_TRANS_COMMIT,
/* sync_log == false: the caller does not need to sync the log. */
BTRFS_DONT_NEED_LOG_SYNC,
/* sync_log == false: the caller needs to sync the log. */
BTRFS_NEED_LOG_SYNC,
};
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
struct dentry *parent);
struct dentry *parent,
bool sync_log, struct btrfs_log_ctx *ctx);
#endif
......@@ -4491,7 +4491,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_end_transaction(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
ret = btrfs_commit_transaction(trans);
}
done:
btrfs_free_path(path);
if (ret) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment