Commit c2aa1a44 authored by Linus Torvalds

Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull vfs dedup fixes from Dave Chinner:
 "This reworks the vfs data cloning infrastructure.

  We discovered many issues with these interfaces late in the 4.19 cycle
  - the worst of them (data corruption, setuid stripping) were fixed for
  XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
  the problems was needed. That rework is the contents of this pull
  request.

  Rework the vfs_clone_file_range and vfs_dedupe_file_range
  infrastructure to use a common .remap_file_range method and supply
  generic bounds and sanity checking functions that are shared with the
  data write path. The current VFS infrastructure has problems with
  rlimit, LFS file sizes, file time stamps, maximum filesystem file
  sizes, stripping setuid bits, etc and so they are addressed in these
  commits.

  We also introduce the ability for the ->remap_file_range methods to
  return short clones so that clones for vfs_copy_file_range() don't get
  rejected if the entire range can't be cloned. It also allows
  filesystems to silently skip deduplication of partial EOF blocks if
  they are not capable of doing so without requiring errors to be thrown
  to userspace.

  Existing filesystems are converted to use the new remap_file_range
  method, and both XFS and ocfs2 are modified to make use of the new
  generic checking infrastructure"

* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
  xfs: remove [cm]time update from reflink calls
  xfs: remove xfs_reflink_remap_range
  xfs: remove redundant remap partial EOF block checks
  xfs: support returning partial reflink results
  xfs: clean up xfs_reflink_remap_blocks call site
  xfs: fix pagecache truncation prior to reflink
  ocfs2: remove ocfs2_reflink_remap_range
  ocfs2: support partial clone range and dedupe range
  ocfs2: fix pagecache truncation prior to reflink
  ocfs2: truncate page cache for clone destination file before remapping
  vfs: clean up generic_remap_file_range_prep return value
  vfs: hide file range comparison function
  vfs: enable remap callers that can handle short operations
  vfs: plumb remap flags through the vfs dedupe functions
  vfs: plumb remap flags through the vfs clone functions
  vfs: make remap_file_range functions take and return bytes completed
  vfs: remap helper should update destination inode metadata
  vfs: pass remap flags to generic_remap_checks
  vfs: pass remap flags to generic_remap_file_range_prep
  vfs: combine the clone and dedupe into a single remap_file_range
  ...
parents b69f9e17 bf4a1fcf
...@@ -623,6 +623,11 @@ in your dentry operations instead. ...@@ -623,6 +623,11 @@ in your dentry operations instead.
On success you get a new struct file sharing the mount/dentry with the On success you get a new struct file sharing the mount/dentry with the
original, on failure - ERR_PTR(). original, on failure - ERR_PTR().
-- --
[mandatory]
->clone_file_range() and ->dedupe_file_range() have been replaced with
->remap_file_range(). See Documentation/filesystems/vfs.txt for more
information.
--
[recommended] [recommended]
->lookup() instances doing an equivalent of ->lookup() instances doing an equivalent of
if (IS_ERR(inode)) if (IS_ERR(inode))
......
...@@ -883,8 +883,9 @@ struct file_operations { ...@@ -883,8 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *); unsigned (*mmap_capabilities)(struct file *);
#endif #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int); int (*fadvise)(struct file *, loff_t, loff_t, int);
}; };
...@@ -960,11 +961,18 @@ otherwise noted. ...@@ -960,11 +961,18 @@ otherwise noted.
copy_file_range: called by the copy_file_range(2) system call. copy_file_range: called by the copy_file_range(2) system call.
clone_file_range: called by the ioctl(2) system call for FICLONERANGE and remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
FICLONE commands. FICLONE and FIDEDUPERANGE commands to remap file ranges. An
implementation should remap len bytes at pos_in of the source file into
dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE the dest file at pos_out. Implementations must handle callers passing
command. in len == 0; this means "remap to the end of the source file". The
return value should be the number of bytes remapped, or the usual
negative error code if errors occurred before any bytes were remapped.
The remap_flags parameter accepts REMAP_FILE_* flags. If
REMAP_FILE_DEDUP is set then the implementation must only remap if the
requested file ranges have identical contents. If REMAP_FILE_CAN_SHORTEN is
set, the caller is ok with the implementation shortening the request
length to satisfy alignment or EOF requirements (or any other reason).
fadvise: possibly called by the fadvise64() system call. fadvise: possibly called by the fadvise64() system call.
......
...@@ -3201,9 +3201,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, ...@@ -3201,9 +3201,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space); struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs); struct btrfs_ioctl_balance_args *bargs);
int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
struct file *dst_file, loff_t dst_loff,
u64 olen);
/* file.c */ /* file.c */
int __init btrfs_auto_defrag_init(void); int __init btrfs_auto_defrag_init(void);
...@@ -3233,8 +3230,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, ...@@ -3233,8 +3230,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes, size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached); struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len); struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
/* tree-defrag.c */ /* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
......
...@@ -3298,8 +3298,7 @@ const struct file_operations btrfs_file_operations = { ...@@ -3298,8 +3298,7 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl, .compat_ioctl = btrfs_compat_ioctl,
#endif #endif
.clone_file_range = btrfs_clone_file_range, .remap_file_range = btrfs_remap_file_range,
.dedupe_file_range = btrfs_dedupe_file_range,
}; };
void __cold btrfs_auto_defrag_exit(void) void __cold btrfs_auto_defrag_exit(void)
......
...@@ -3629,26 +3629,6 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, ...@@ -3629,26 +3629,6 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
return ret; return ret;
} }
int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
struct file *dst_file, loff_t dst_loff,
u64 olen)
{
struct inode *src = file_inode(src_file);
struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
/*
* Btrfs does not support blocksize < page_size. As a
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
return -EINVAL;
}
return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
}
static int clone_finish_inode_update(struct btrfs_trans_handle *trans, static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *inode,
u64 endoff, u64 endoff,
...@@ -4350,10 +4330,34 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, ...@@ -4350,10 +4330,34 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
return ret; return ret;
} }
int btrfs_clone_file_range(struct file *src_file, loff_t off, loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, u64 len) struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{ {
return btrfs_clone_files(dst_file, src_file, off, len, destoff); int ret;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (remap_flags & REMAP_FILE_DEDUP) {
struct inode *src = file_inode(src_file);
struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
/*
* Btrfs does not support blocksize < page_size. As a
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
return -EINVAL;
}
ret = btrfs_extent_same(src, off, len, dst, destoff);
} else {
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
return ret < 0 ? ret : len;
} }
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
......
...@@ -992,8 +992,9 @@ const struct inode_operations cifs_symlink_inode_ops = { ...@@ -992,8 +992,9 @@ const struct inode_operations cifs_symlink_inode_ops = {
.listxattr = cifs_listxattr, .listxattr = cifs_listxattr,
}; };
static int cifs_clone_file_range(struct file *src_file, loff_t off, static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, u64 len) struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{ {
struct inode *src_inode = file_inode(src_file); struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file); struct inode *target_inode = file_inode(dst_file);
...@@ -1003,6 +1004,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, ...@@ -1003,6 +1004,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
unsigned int xid; unsigned int xid;
int rc; int rc;
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
cifs_dbg(FYI, "clone range\n"); cifs_dbg(FYI, "clone range\n");
xid = get_xid(); xid = get_xid();
...@@ -1042,7 +1046,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, ...@@ -1042,7 +1046,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
unlock_two_nondirectories(src_inode, target_inode); unlock_two_nondirectories(src_inode, target_inode);
out: out:
free_xid(xid); free_xid(xid);
return rc; return rc < 0 ? rc : len;
} }
ssize_t cifs_file_copychunk_range(unsigned int xid, ssize_t cifs_file_copychunk_range(unsigned int xid,
...@@ -1151,7 +1155,7 @@ const struct file_operations cifs_file_ops = { ...@@ -1151,7 +1155,7 @@ const struct file_operations cifs_file_ops = {
.llseek = cifs_llseek, .llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease, .setlease = cifs_setlease,
.fallocate = cifs_fallocate, .fallocate = cifs_fallocate,
}; };
...@@ -1170,7 +1174,7 @@ const struct file_operations cifs_file_strict_ops = { ...@@ -1170,7 +1174,7 @@ const struct file_operations cifs_file_strict_ops = {
.llseek = cifs_llseek, .llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease, .setlease = cifs_setlease,
.fallocate = cifs_fallocate, .fallocate = cifs_fallocate,
}; };
...@@ -1189,7 +1193,7 @@ const struct file_operations cifs_file_direct_ops = { ...@@ -1189,7 +1193,7 @@ const struct file_operations cifs_file_direct_ops = {
.splice_write = iter_file_splice_write, .splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek, .llseek = cifs_llseek,
.setlease = cifs_setlease, .setlease = cifs_setlease,
.fallocate = cifs_fallocate, .fallocate = cifs_fallocate,
...@@ -1208,7 +1212,7 @@ const struct file_operations cifs_file_nobrl_ops = { ...@@ -1208,7 +1212,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.llseek = cifs_llseek, .llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease, .setlease = cifs_setlease,
.fallocate = cifs_fallocate, .fallocate = cifs_fallocate,
}; };
...@@ -1226,7 +1230,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { ...@@ -1226,7 +1230,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.llseek = cifs_llseek, .llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease, .setlease = cifs_setlease,
.fallocate = cifs_fallocate, .fallocate = cifs_fallocate,
}; };
...@@ -1244,7 +1248,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { ...@@ -1244,7 +1248,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.splice_write = iter_file_splice_write, .splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek, .llseek = cifs_llseek,
.setlease = cifs_setlease, .setlease = cifs_setlease,
.fallocate = cifs_fallocate, .fallocate = cifs_fallocate,
...@@ -1256,7 +1260,7 @@ const struct file_operations cifs_dir_ops = { ...@@ -1256,7 +1260,7 @@ const struct file_operations cifs_dir_ops = {
.read = generic_read_dir, .read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl, .unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range, .copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range, .remap_file_range = cifs_remap_file_range,
.llseek = generic_file_llseek, .llseek = generic_file_llseek,
.fsync = cifs_dir_fsync, .fsync = cifs_dir_fsync,
}; };
......
...@@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ...@@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff) u64 off, u64 olen, u64 destoff)
{ {
struct fd src_file = fdget(srcfd); struct fd src_file = fdget(srcfd);
loff_t cloned;
int ret; int ret;
if (!src_file.file) if (!src_file.file)
...@@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ...@@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EXDEV; ret = -EXDEV;
if (src_file.file->f_path.mnt != dst_file->f_path.mnt) if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput; goto fdput;
ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
olen, 0);
if (cloned < 0)
ret = cloned;
else if (olen && cloned != olen)
ret = -EINVAL;
else
ret = 0;
fdput: fdput:
fdput(src_file); fdput(src_file);
return ret; return ret;
......
...@@ -180,8 +180,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t ...@@ -180,8 +180,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_allocate(filep, offset, len); return nfs42_proc_allocate(filep, offset, len);
} }
static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, u64 count) struct file *dst_file, loff_t dst_off, loff_t count,
unsigned int remap_flags)
{ {
struct inode *dst_inode = file_inode(dst_file); struct inode *dst_inode = file_inode(dst_file);
struct nfs_server *server = NFS_SERVER(dst_inode); struct nfs_server *server = NFS_SERVER(dst_inode);
...@@ -190,6 +191,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, ...@@ -190,6 +191,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
bool same_inode = false; bool same_inode = false;
int ret; int ret;
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
/* check alignment w.r.t. clone_blksize */ /* check alignment w.r.t. clone_blksize */
ret = -EINVAL; ret = -EINVAL;
if (bs) { if (bs) {
...@@ -240,7 +244,7 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, ...@@ -240,7 +244,7 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
inode_unlock(src_inode); inode_unlock(src_inode);
} }
out: out:
return ret; return ret < 0 ? ret : count;
} }
#endif /* CONFIG_NFS_V4_2 */ #endif /* CONFIG_NFS_V4_2 */
...@@ -262,7 +266,7 @@ const struct file_operations nfs4_file_operations = { ...@@ -262,7 +266,7 @@ const struct file_operations nfs4_file_operations = {
.copy_file_range = nfs4_copy_file_range, .copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek, .llseek = nfs4_file_llseek,
.fallocate = nfs42_fallocate, .fallocate = nfs42_fallocate,
.clone_file_range = nfs42_clone_file_range, .remap_file_range = nfs42_remap_file_range,
#else #else
.llseek = nfs_file_llseek, .llseek = nfs_file_llseek,
#endif #endif
......
...@@ -541,8 +541,12 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, ...@@ -541,8 +541,12 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count) u64 dst_pos, u64 count)
{ {
return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, loff_t cloned;
count));
cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
if (count && cloned != count)
cloned = -EINVAL;
return nfserrno(cloned < 0 ? cloned : 0);
} }
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
......
...@@ -2527,24 +2527,79 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) ...@@ -2527,24 +2527,79 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
return offset; return offset;
} }
static int ocfs2_file_clone_range(struct file *file_in, static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
loff_t pos_in, struct file *file_out, loff_t pos_out,
struct file *file_out, loff_t len, unsigned int remap_flags)
loff_t pos_out,
u64 len)
{ {
return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, struct inode *inode_in = file_inode(file_in);
len, false); struct inode *inode_out = file_inode(file_out);
} struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
struct buffer_head *in_bh = NULL, *out_bh = NULL;
bool same_inode = (inode_in == inode_out);
loff_t remapped = 0;
ssize_t ret;
static int ocfs2_file_dedupe_range(struct file *file_in, if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
loff_t pos_in, return -EINVAL;
struct file *file_out, if (!ocfs2_refcount_tree(osb))
loff_t pos_out, return -EOPNOTSUPP;
u64 len) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
{ return -EROFS;
return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, true); /* Lock both files against IO */
ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
if (ret)
return ret;
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
if (ret < 0 || len == 0)
goto out_unlock;
/* Lock out changes to the allocation maps and remap. */
down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
SINGLE_DEPTH_NESTING);
/* Zap any page cache for the destination file's range. */
truncate_inode_pages_range(&inode_out->i_data,
round_down(pos_out, PAGE_SIZE),
round_up(pos_out + len, PAGE_SIZE) - 1);
remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
inode_out, out_bh, pos_out, len);
up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
if (remapped < 0) {
ret = remapped;
mlog_errno(ret);
goto out_unlock;
}
/*
* Empty the extent map so that we may get the right extent
* record from the disk.
*/
ocfs2_extent_map_trunc(inode_in, 0);
ocfs2_extent_map_trunc(inode_out, 0);
ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
out_unlock:
ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
return remapped > 0 ? remapped : ret;
} }
const struct inode_operations ocfs2_file_iops = { const struct inode_operations ocfs2_file_iops = {
...@@ -2586,8 +2641,7 @@ const struct file_operations ocfs2_fops = { ...@@ -2586,8 +2641,7 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read, .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write, .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate, .fallocate = ocfs2_fallocate,
.clone_file_range = ocfs2_file_clone_range, .remap_file_range = ocfs2_remap_file_range,
.dedupe_file_range = ocfs2_file_dedupe_range,
}; };
const struct file_operations ocfs2_dops = { const struct file_operations ocfs2_dops = {
...@@ -2633,8 +2687,7 @@ const struct file_operations ocfs2_fops_no_plocks = { ...@@ -2633,8 +2687,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read, .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write, .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate, .fallocate = ocfs2_fallocate,
.clone_file_range = ocfs2_file_clone_range, .remap_file_range = ocfs2_remap_file_range,
.dedupe_file_range = ocfs2_file_dedupe_range,
}; };
const struct file_operations ocfs2_dops_no_plocks = { const struct file_operations ocfs2_dops_no_plocks = {
......
...@@ -4466,9 +4466,9 @@ int ocfs2_reflink_ioctl(struct inode *inode, ...@@ -4466,9 +4466,9 @@ int ocfs2_reflink_ioctl(struct inode *inode,
} }
/* Update destination inode size, if necessary. */ /* Update destination inode size, if necessary. */
static int ocfs2_reflink_update_dest(struct inode *dest, int ocfs2_reflink_update_dest(struct inode *dest,
struct buffer_head *d_bh, struct buffer_head *d_bh,
loff_t newlen) loff_t newlen)
{ {
handle_t *handle; handle_t *handle;
int ret; int ret;
...@@ -4505,14 +4505,14 @@ static int ocfs2_reflink_update_dest(struct inode *dest, ...@@ -4505,14 +4505,14 @@ static int ocfs2_reflink_update_dest(struct inode *dest,
} }
/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
static int ocfs2_reflink_remap_extent(struct inode *s_inode, static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
struct buffer_head *s_bh, struct buffer_head *s_bh,
loff_t pos_in, loff_t pos_in,
struct inode *t_inode, struct inode *t_inode,
struct buffer_head *t_bh, struct buffer_head *t_bh,
loff_t pos_out, loff_t pos_out,
loff_t len, loff_t len,
struct ocfs2_cached_dealloc_ctxt *dealloc) struct ocfs2_cached_dealloc_ctxt *dealloc)
{ {
struct ocfs2_extent_tree s_et; struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et; struct ocfs2_extent_tree t_et;
...@@ -4520,8 +4520,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, ...@@ -4520,8 +4520,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
struct buffer_head *ref_root_bh = NULL; struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree; struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb; struct ocfs2_super *osb;
loff_t remapped_bytes = 0;
loff_t pstart, plen; loff_t pstart, plen;
u32 p_cluster, num_clusters, slast, spos, tpos; u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
unsigned int ext_flags; unsigned int ext_flags;
int ret = 0; int ret = 0;
...@@ -4603,30 +4604,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, ...@@ -4603,30 +4604,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
next_loop: next_loop:
spos += num_clusters; spos += num_clusters;
tpos += num_clusters; tpos += num_clusters;
remapped_clus += num_clusters;
} }
out: goto out;
return ret;
out_unlock_refcount: out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1); ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh); brelse(ref_root_bh);
return ret; out:
remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
remapped_bytes = min_t(loff_t, len, remapped_bytes);
return remapped_bytes > 0 ? remapped_bytes : ret;
} }
/* Set up refcount tree and remap s_inode to t_inode. */ /* Set up refcount tree and remap s_inode to t_inode. */
static int ocfs2_reflink_remap_blocks(struct inode *s_inode, loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
struct buffer_head *s_bh, struct buffer_head *s_bh,
loff_t pos_in, loff_t pos_in,
struct inode *t_inode, struct inode *t_inode,
struct buffer_head *t_bh, struct buffer_head *t_bh,
loff_t pos_out, loff_t pos_out,
loff_t len) loff_t len)
{ {
struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb; struct ocfs2_super *osb;
struct ocfs2_dinode *dis; struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit; struct ocfs2_dinode *dit;
int ret; loff_t ret;
osb = OCFS2_SB(s_inode->i_sb); osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data; dis = (struct ocfs2_dinode *)s_bh->b_data;
...@@ -4698,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode, ...@@ -4698,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
/* Actually remap extents now. */ /* Actually remap extents now. */
ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
pos_out, len, &dealloc); pos_out, len, &dealloc);
if (ret) { if (ret < 0) {
mlog_errno(ret); mlog_errno(ret);
goto out; goto out;
} }
...@@ -4713,10 +4718,10 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode, ...@@ -4713,10 +4718,10 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
} }
/* Lock an inode and grab a bh pointing to the inode. */ /* Lock an inode and grab a bh pointing to the inode. */
static int ocfs2_reflink_inodes_lock(struct inode *s_inode, int ocfs2_reflink_inodes_lock(struct inode *s_inode,
struct buffer_head **bh1, struct buffer_head **bh1,
struct inode *t_inode, struct inode *t_inode,
struct buffer_head **bh2) struct buffer_head **bh2)
{ {
struct inode *inode1; struct inode *inode1;
struct inode *inode2; struct inode *inode2;
...@@ -4801,10 +4806,10 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, ...@@ -4801,10 +4806,10 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
} }
/* Unlock both inodes and release buffers. */ /* Unlock both inodes and release buffers. */
static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
struct buffer_head *s_bh, struct buffer_head *s_bh,
struct inode *t_inode, struct inode *t_inode,
struct buffer_head *t_bh) struct buffer_head *t_bh)
{ {
ocfs2_inode_unlock(s_inode, 1); ocfs2_inode_unlock(s_inode, 1);
ocfs2_rw_unlock(s_inode, 1); ocfs2_rw_unlock(s_inode, 1);
...@@ -4816,82 +4821,3 @@ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, ...@@ -4816,82 +4821,3 @@ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
} }
unlock_two_nondirectories(s_inode, t_inode); unlock_two_nondirectories(s_inode, t_inode);
} }
/* Link a range of blocks from one file to another. */
int ocfs2_reflink_remap_range(struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
u64 len,
bool is_dedupe)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
struct buffer_head *in_bh = NULL, *out_bh = NULL;
bool same_inode = (inode_in == inode_out);
ssize_t ret;
if (!ocfs2_refcount_tree(osb))
return -EOPNOTSUPP;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
/* Lock both files against IO */
ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
if (ret)
return ret;
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
/* Lock out changes to the allocation maps and remap. */
down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
SINGLE_DEPTH_NESTING);
ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
out_bh, pos_out, len);
/* Zap any page cache for the destination file's range. */
if (!ret)
truncate_inode_pages_range(&inode_out->i_data, pos_out,
PAGE_ALIGN(pos_out + len) - 1);
up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
/*
* Empty the extent map so that we may get the right extent
* record from the disk.
*/
ocfs2_extent_map_trunc(inode_in, 0);
ocfs2_extent_map_trunc(inode_out, 0);
ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
return 0;
out_unlock:
ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
return ret;
}
...@@ -115,11 +115,23 @@ int ocfs2_reflink_ioctl(struct inode *inode, ...@@ -115,11 +115,23 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname, const char __user *oldname,
const char __user *newname, const char __user *newname,
bool preserve); bool preserve);
int ocfs2_reflink_remap_range(struct file *file_in, loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
loff_t pos_in, struct buffer_head *s_bh,
struct file *file_out, loff_t pos_in,
loff_t pos_out, struct inode *t_inode,
u64 len, struct buffer_head *t_bh,
bool is_dedupe); loff_t pos_out,
loff_t len);
int ocfs2_reflink_inodes_lock(struct inode *s_inode,
struct buffer_head **bh1,
struct inode *t_inode,
struct buffer_head **bh2);
void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
struct buffer_head *s_bh,
struct inode *t_inode,
struct buffer_head *t_bh);
int ocfs2_reflink_update_dest(struct inode *dest,
struct buffer_head *d_bh,
loff_t newlen);
#endif /* OCFS2_REFCOUNTTREE_H */ #endif /* OCFS2_REFCOUNTTREE_H */
...@@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) ...@@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
struct file *new_file; struct file *new_file;
loff_t old_pos = 0; loff_t old_pos = 0;
loff_t new_pos = 0; loff_t new_pos = 0;
loff_t cloned;
int error = 0; int error = 0;
if (len == 0) if (len == 0)
...@@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) ...@@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
} }
/* Try to use clone_file_range to clone up within the same fs */ /* Try to use clone_file_range to clone up within the same fs */
error = do_clone_file_range(old_file, 0, new_file, 0, len); cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (!error) if (cloned == len)
goto out; goto out;
/* Couldn't clone, so now we try to copy the data */ /* Couldn't clone, so now we try to copy the data */
error = 0;
/* FIXME: copy up sparse files efficiently */ /* FIXME: copy up sparse files efficiently */
while (len) { while (len) {
......
...@@ -434,14 +434,14 @@ enum ovl_copyop { ...@@ -434,14 +434,14 @@ enum ovl_copyop {
OVL_DEDUPE, OVL_DEDUPE,
}; };
static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, struct file *file_out, loff_t pos_out,
u64 len, unsigned int flags, enum ovl_copyop op) loff_t len, unsigned int flags, enum ovl_copyop op)
{ {
struct inode *inode_out = file_inode(file_out); struct inode *inode_out = file_inode(file_out);
struct fd real_in, real_out; struct fd real_in, real_out;
const struct cred *old_cred; const struct cred *old_cred;
ssize_t ret; loff_t ret;
ret = ovl_real_fdget(file_out, &real_out); ret = ovl_real_fdget(file_out, &real_out);
if (ret) if (ret)
...@@ -462,12 +462,13 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, ...@@ -462,12 +462,13 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
case OVL_CLONE: case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in, ret = vfs_clone_file_range(real_in.file, pos_in,
real_out.file, pos_out, len); real_out.file, pos_out, len, flags);
break; break;
case OVL_DEDUPE: case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in, ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
real_out.file, pos_out, len); real_out.file, pos_out, len,
flags);
break; break;
} }
revert_creds(old_cred); revert_creds(old_cred);
...@@ -489,26 +490,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, ...@@ -489,26 +490,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
OVL_COPY); OVL_COPY);
} }
static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len) struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
{ {
return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, enum ovl_copyop op;
OVL_CLONE);
} if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (remap_flags & REMAP_FILE_DEDUP)
op = OVL_DEDUPE;
else
op = OVL_CLONE;
static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
{
/* /*
* Don't copy up because of a dedupe request, this wouldn't make sense * Don't copy up because of a dedupe request, this wouldn't make sense
* most of the time (data would be duplicated instead of deduplicated). * most of the time (data would be duplicated instead of deduplicated).
*/ */
if (!ovl_inode_upper(file_inode(file_in)) || if (op == OVL_DEDUPE &&
!ovl_inode_upper(file_inode(file_out))) (!ovl_inode_upper(file_inode(file_in)) ||
!ovl_inode_upper(file_inode(file_out))))
return -EPERM; return -EPERM;
return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
OVL_DEDUPE); remap_flags, op);
} }
const struct file_operations ovl_file_operations = { const struct file_operations ovl_file_operations = {
...@@ -525,6 +531,5 @@ const struct file_operations ovl_file_operations = { ...@@ -525,6 +531,5 @@ const struct file_operations ovl_file_operations = {
.compat_ioctl = ovl_compat_ioctl, .compat_ioctl = ovl_compat_ioctl,
.copy_file_range = ovl_copy_file_range, .copy_file_range = ovl_copy_file_range,
.clone_file_range = ovl_clone_file_range, .remap_file_range = ovl_remap_file_range,
.dedupe_file_range = ovl_dedupe_file_range,
}; };
...@@ -1587,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, ...@@ -1587,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and * Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS). * more efficient if both clone and copy are supported (e.g. NFS).
*/ */
if (file_in->f_op->clone_file_range) { if (file_in->f_op->remap_file_range) {
ret = file_in->f_op->clone_file_range(file_in, pos_in, loff_t cloned;
file_out, pos_out, len);
if (ret == 0) { cloned = file_in->f_op->remap_file_range(file_in, pos_in,
ret = len; file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
if (cloned > 0) {
ret = cloned;
goto done; goto done;
} }
} }
...@@ -1685,11 +1689,12 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, ...@@ -1685,11 +1689,12 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
return ret; return ret;
} }
static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{ {
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
if (unlikely(pos < 0)) if (unlikely(pos < 0 || len < 0))
return -EINVAL; return -EINVAL;
if (unlikely((loff_t) (pos + len) < 0)) if (unlikely((loff_t) (pos + len) < 0))
...@@ -1707,22 +1712,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) ...@@ -1707,22 +1712,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ); return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
} }
/*
 * Decide whether a remap request may end in a partial (not block-aligned)
 * block, and trim or reject the request if it may not.  The offsets are
 * assumed to have been checked for block alignment already.
 *
 * A length that is a multiple of inode_in's block size is always accepted.
 * Otherwise:
 *  - a dedupe request is scaled down to the previous block boundary, because
 *    post-EOF contents cannot be meaningfully compared;
 *  - a clone request keeps its partial tail only when that tail ends at or
 *    beyond the destination file's EOF; a tail that would land inside the
 *    destination file is scaled down as well.
 *
 * Scaling down is only allowed when REMAP_FILE_CAN_SHORTEN is set, in which
 * case *len is updated.  Without that flag the request fails with -EBADE for
 * dedupe and -EINVAL for clone.  Returns 0 when the (possibly shortened)
 * length is acceptable.
 */
static int generic_remap_check_len(struct inode *inode_in,
				   struct inode *inode_out,
				   loff_t pos_out,
				   loff_t *len,
				   unsigned int remap_flags)
{
	u64 blkmask = i_blocksize(inode_in) - 1;
	unsigned int dedupe = remap_flags & REMAP_FILE_DEDUP;

	/* Whole blocks only: there is no partial tail to worry about. */
	if (!(*len & blkmask))
		return 0;

	/* A clone may keep a partial tail at or past the destination EOF. */
	if (!dedupe && pos_out + *len >= i_size_read(inode_out))
		return 0;

	/* The tail has to go, which needs the caller's consent. */
	if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return dedupe ? -EBADE : -EINVAL;

	*len &= ~blkmask;
	return 0;
}
/*
 * Bring the page covering @offset of @inode into the page cache and hand it
 * back locked.  If the read fails, the ERR_PTR from read_mapping_page() is
 * passed through; if the page comes back not uptodate, it is released and
 * ERR_PTR(-EIO) is returned instead.
 */
static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
{
	struct page *pg = read_mapping_page(inode->i_mapping,
					    offset >> PAGE_SHIFT, NULL);

	if (IS_ERR(pg))
		return pg;

	if (PageUptodate(pg)) {
		lock_page(pg);
		return pg;
	}

	/* The contents are not valid: give the page back and report failure. */
	put_page(pg);
	return ERR_PTR(-EIO);
}
/*
 * Byte-compare @len bytes of @src starting at @srcoff with @dest starting at
 * @destoff, and report through @is_same whether the two ranges are identical.
 * Caller must have locked both inodes to prevent write races.
 *
 * The ranges are walked in chunks that never cross a page boundary in either
 * file; the walk stops at the first chunk that differs.
 *
 * Returns 0 with *is_same filled in, -EINVAL if a chunk length works out to
 * be non-positive, or the error from fetching one of the pages.  *is_same is
 * left untouched on error.
 */
static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
					 struct inode *dest, loff_t destoff,
					 loff_t len, bool *is_same)
{
	bool same = true;

	while (len) {
		loff_t src_poff = srcoff & (PAGE_SIZE - 1);
		loff_t dest_poff = destoff & (PAGE_SIZE - 1);
		struct page *src_page;
		struct page *dest_page;
		void *src_addr;
		void *dest_addr;
		loff_t cmp_len;

		/* Largest run that stays within one page of both files. */
		cmp_len = min(PAGE_SIZE - src_poff,
			      PAGE_SIZE - dest_poff);
		cmp_len = min(cmp_len, len);
		if (cmp_len <= 0)
			return -EINVAL;

		src_page = vfs_dedupe_get_page(src, srcoff);
		if (IS_ERR(src_page))
			return PTR_ERR(src_page);

		dest_page = vfs_dedupe_get_page(dest, destoff);
		if (IS_ERR(dest_page)) {
			/* Undo the source page before bailing out. */
			unlock_page(src_page);
			put_page(src_page);
			return PTR_ERR(dest_page);
		}

		src_addr = kmap_atomic(src_page);
		dest_addr = kmap_atomic(dest_page);

		flush_dcache_page(src_page);
		flush_dcache_page(dest_page);

		same = !memcmp(src_addr + src_poff, dest_addr + dest_poff,
			       cmp_len);

		/* Tear down in the reverse order of setup. */
		kunmap_atomic(dest_addr);
		kunmap_atomic(src_addr);
		unlock_page(dest_page);
		unlock_page(src_page);
		put_page(dest_page);
		put_page(src_page);

		if (!same)
			break;

		srcoff += cmp_len;
		destoff += cmp_len;
		len -= cmp_len;
	}

	*is_same = same;
	return 0;
}
/* /*
* Check that the two inodes are eligible for cloning, the ranges make * Check that the two inodes are eligible for cloning, the ranges make
* sense, and then flush all dirty data. Caller must ensure that the * sense, and then flush all dirty data. Caller must ensure that the
* inodes have been locked against any other modifications. * inodes have been locked against any other modifications.
* *
* Returns: 0 for "nothing to clone", 1 for "something to clone", or * If there's an error, then the usual negative error code is returned.
* the usual negative error code. * Otherwise returns 0 with *len set to the request length.
*/ */
int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct inode *inode_out, loff_t pos_out, struct file *file_out, loff_t pos_out,
u64 *len, bool is_dedupe) loff_t *len, unsigned int remap_flags)
{ {
loff_t bs = inode_out->i_sb->s_blocksize; struct inode *inode_in = file_inode(file_in);
loff_t blen; struct inode *inode_out = file_inode(file_out);
loff_t isize;
bool same_inode = (inode_in == inode_out); bool same_inode = (inode_in == inode_out);
int ret; int ret;
...@@ -1739,50 +1872,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, ...@@ -1739,50 +1872,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL; return -EINVAL;
/* Are we going all the way to the end? */
isize = i_size_read(inode_in);
if (isize == 0)
return 0;
/* Zero length dedupe exits immediately; reflink goes to EOF. */ /* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) { if (*len == 0) {
if (is_dedupe || pos_in == isize) loff_t isize = i_size_read(inode_in);
if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
return 0; return 0;
if (pos_in > isize) if (pos_in > isize)
return -EINVAL; return -EINVAL;
*len = isize - pos_in; *len = isize - pos_in;
if (*len == 0)
return 0;
} }
/* Ensure offsets don't wrap and the input is inside i_size */ /* Check that we don't violate system file offset limits. */
if (pos_in + *len < pos_in || pos_out + *len < pos_out || ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
pos_in + *len > isize) remap_flags);
return -EINVAL; if (ret)
return ret;
/* Don't allow dedupe past EOF in the dest file */
if (is_dedupe) {
loff_t disize;
disize = i_size_read(inode_out);
if (pos_out >= disize || pos_out + *len > disize)
return -EINVAL;
}
/* If we're linking to EOF, continue to the block boundary. */
if (pos_in + *len == isize)
blen = ALIGN(isize, bs) - pos_in;
else
blen = *len;
/* Only reflink if we're aligned to block boundaries */
if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
!IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
return -EINVAL;
/* Don't allow overlapped reflink within the same file */
if (same_inode) {
if (pos_out + blen > pos_in && pos_out < pos_in + blen)
return -EINVAL;
}
/* Wait for the completion of any pending IOs on both files */ /* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in); inode_dio_wait(inode_in);
...@@ -1802,7 +1909,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, ...@@ -1802,7 +1909,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
/* /*
* Check that the extents are the same. * Check that the extents are the same.
*/ */
if (is_dedupe) { if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false; bool is_same = false;
ret = vfs_dedupe_file_range_compare(inode_in, pos_in, ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
...@@ -1813,16 +1920,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, ...@@ -1813,16 +1920,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
return -EBADE; return -EBADE;
} }
return 1; ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
remap_flags);
if (ret)
return ret;
/* If can't alter the file contents, we're done. */
if (!(remap_flags & REMAP_FILE_DEDUP)) {
/* Update the timestamps, since we can alter file contents. */
if (!(file_out->f_mode & FMODE_NOCMTIME)) {
ret = file_update_time(file_out);
if (ret)
return ret;
}
/*
* Clear the security bits if the process is not being run by
* root. This keeps people from modifying setuid and setgid
* binaries.
*/
ret = file_remove_privs(file_out);
if (ret)
return ret;
}
return 0;
} }
EXPORT_SYMBOL(vfs_clone_file_prep_inodes); EXPORT_SYMBOL(generic_remap_file_range_prep);
int do_clone_file_range(struct file *file_in, loff_t pos_in, loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len) struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
{ {
struct inode *inode_in = file_inode(file_in); struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out); struct inode *inode_out = file_inode(file_out);
int ret; loff_t ret;
WARN_ON_ONCE(remap_flags);
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR; return -EISDIR;
...@@ -1842,140 +1976,43 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, ...@@ -1842,140 +1976,43 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in,
(file_out->f_flags & O_APPEND)) (file_out->f_flags & O_APPEND))
return -EBADF; return -EBADF;
if (!file_in->f_op->clone_file_range) if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP; return -EOPNOTSUPP;
ret = clone_verify_area(file_in, pos_in, len, false); ret = remap_verify_area(file_in, pos_in, len, false);
if (ret) if (ret)
return ret; return ret;
ret = clone_verify_area(file_out, pos_out, len, true); ret = remap_verify_area(file_out, pos_out, len, true);
if (ret) if (ret)
return ret; return ret;
if (pos_in + len > i_size_read(inode_in)) ret = file_in->f_op->remap_file_range(file_in, pos_in,
return -EINVAL; file_out, pos_out, len, remap_flags);
if (ret < 0)
ret = file_in->f_op->clone_file_range(file_in, pos_in, return ret;
file_out, pos_out, len);
if (!ret) {
fsnotify_access(file_in);
fsnotify_modify(file_out);
}
fsnotify_access(file_in);
fsnotify_modify(file_out);
return ret; return ret;
} }
EXPORT_SYMBOL(do_clone_file_range); EXPORT_SYMBOL(do_clone_file_range);
int vfs_clone_file_range(struct file *file_in, loff_t pos_in, loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len) struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
{ {
int ret; loff_t ret;
file_start_write(file_out); file_start_write(file_out);
ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
remap_flags);
file_end_write(file_out); file_end_write(file_out);
return ret; return ret;
} }
EXPORT_SYMBOL(vfs_clone_file_range); EXPORT_SYMBOL(vfs_clone_file_range);
/*
 * Read the page that covers @offset of @inode into the page cache and return
 * it locked.  A failed read returns the ERR_PTR from read_mapping_page(); a
 * page that is not uptodate after the read is released and reported as
 * ERR_PTR(-EIO).
 */
static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
{
	pgoff_t index = offset >> PAGE_SHIFT;
	struct page *pg = read_mapping_page(inode->i_mapping, index, NULL);

	if (IS_ERR(pg))
		return pg;

	if (PageUptodate(pg)) {
		lock_page(pg);
		return pg;
	}

	/* Contents are not valid: release the page and fail the lookup. */
	put_page(pg);
	return ERR_PTR(-EIO);
}
/*
 * Compare @len bytes of @src at @srcoff against @dest at @destoff and store
 * in @is_same whether the two extents hold identical data.
 * Caller must have locked both inodes to prevent write races.
 *
 * Each pass compares one chunk that fits inside a single page of both files,
 * and the scan ends early at the first mismatch.
 *
 * Returns 0 with *is_same set, -EINVAL if a chunk length turns out to be
 * non-positive, or the error from reading a page.  On error *is_same is not
 * written.
 */
int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
				  struct inode *dest, loff_t destoff,
				  loff_t len, bool *is_same)
{
	struct page *src_page, *dest_page;
	void *src_addr, *dest_addr;
	loff_t src_poff, dest_poff;
	loff_t cmp_len;
	bool same = true;

	for (; len; srcoff += cmp_len, destoff += cmp_len, len -= cmp_len) {
		src_poff = srcoff & (PAGE_SIZE - 1);
		dest_poff = destoff & (PAGE_SIZE - 1);

		/* Do not let the chunk run off the end of either page. */
		cmp_len = min(PAGE_SIZE - src_poff,
			      PAGE_SIZE - dest_poff);
		cmp_len = min(cmp_len, len);
		if (cmp_len <= 0)
			return -EINVAL;

		src_page = vfs_dedupe_get_page(src, srcoff);
		if (IS_ERR(src_page))
			return PTR_ERR(src_page);

		dest_page = vfs_dedupe_get_page(dest, destoff);
		if (IS_ERR(dest_page)) {
			/* The source page is still locked and held; undo that. */
			unlock_page(src_page);
			put_page(src_page);
			return PTR_ERR(dest_page);
		}

		src_addr = kmap_atomic(src_page);
		dest_addr = kmap_atomic(dest_page);

		flush_dcache_page(src_page);
		flush_dcache_page(dest_page);

		same = !memcmp(src_addr + src_poff, dest_addr + dest_poff,
			       cmp_len);

		/* Release everything in the reverse order it was acquired. */
		kunmap_atomic(dest_addr);
		kunmap_atomic(src_addr);
		unlock_page(dest_page);
		unlock_page(src_page);
		put_page(dest_page);
		put_page(src_page);

		/* Leave before the offsets advance past the mismatch. */
		if (!same)
			break;
	}

	*is_same = same;
	return 0;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
/* Check whether we are allowed to dedupe the destination file */ /* Check whether we are allowed to dedupe the destination file */
static bool allow_file_dedupe(struct file *file) static bool allow_file_dedupe(struct file *file)
{ {
...@@ -1990,16 +2027,20 @@ static bool allow_file_dedupe(struct file *file) ...@@ -1990,16 +2027,20 @@ static bool allow_file_dedupe(struct file *file)
return false; return false;
} }
int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos, u64 len) struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags)
{ {
s64 ret; loff_t ret;
WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
REMAP_FILE_CAN_SHORTEN));
ret = mnt_want_write_file(dst_file); ret = mnt_want_write_file(dst_file);
if (ret) if (ret)
return ret; return ret;
ret = clone_verify_area(dst_file, dst_pos, len, true); ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0) if (ret < 0)
goto out_drop_write; goto out_drop_write;
...@@ -2016,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, ...@@ -2016,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write; goto out_drop_write;
ret = -EINVAL; ret = -EINVAL;
if (!dst_file->f_op->dedupe_file_range) if (!dst_file->f_op->remap_file_range)
goto out_drop_write; goto out_drop_write;
ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, if (len == 0) {
dst_file, dst_pos, len); ret = 0;
goto out_drop_write;
}
ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
out_drop_write: out_drop_write:
mnt_drop_write_file(dst_file); mnt_drop_write_file(dst_file);
...@@ -2037,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) ...@@ -2037,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
int i; int i;
int ret; int ret;
u16 count = same->dest_count; u16 count = same->dest_count;
int deduped; loff_t deduped;
if (!(file->f_mode & FMODE_READ)) if (!(file->f_mode & FMODE_READ))
return -EINVAL; return -EINVAL;
...@@ -2056,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) ...@@ -2056,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
if (!S_ISREG(src->i_mode)) if (!S_ISREG(src->i_mode))
goto out; goto out;
ret = clone_verify_area(file, off, len, false); ret = remap_verify_area(file, off, len, false);
if (ret < 0) if (ret < 0)
goto out; goto out;
ret = 0; ret = 0;
...@@ -2088,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) ...@@ -2088,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
} }
deduped = vfs_dedupe_file_range_one(file, off, dst_file, deduped = vfs_dedupe_file_range_one(file, off, dst_file,
info->dest_offset, len); info->dest_offset, len,
REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE) if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS; info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0) else if (deduped < 0)
......
...@@ -919,28 +919,67 @@ xfs_file_fallocate( ...@@ -919,28 +919,67 @@ xfs_file_fallocate(
return error; return error;
} }
STATIC int
xfs_file_clone_range(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
u64 len)
{
return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, false);
}
STATIC int loff_t
xfs_file_dedupe_range( xfs_file_remap_range(
struct file *file_in, struct file *file_in,
loff_t pos_in, loff_t pos_in,
struct file *file_out, struct file *file_out,
loff_t pos_out, loff_t pos_out,
u64 len) loff_t len,
unsigned int remap_flags)
{ {
return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, struct inode *inode_in = file_inode(file_in);
len, true); struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
struct xfs_mount *mp = src->i_mount;
loff_t remapped = 0;
xfs_extlen_t cowextsize;
int ret;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
if (ret < 0 || len == 0)
return ret;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
&remapped);
if (ret)
goto out_unlock;
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
* has a cowextsize hint, and the destination file does not.
*/
cowextsize = 0;
if (pos_in == 0 && len == i_size_read(inode_in) &&
(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
pos_out == 0 && len >= i_size_read(inode_out) &&
!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
cowextsize = src->i_d.di_cowextsize;
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
} }
STATIC int STATIC int
...@@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = { ...@@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync, .fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area, .get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate, .fallocate = xfs_file_fallocate,
.clone_file_range = xfs_file_clone_range, .remap_file_range = xfs_file_remap_range,
.dedupe_file_range = xfs_file_dedupe_range,
}; };
const struct file_operations xfs_dir_file_operations = { const struct file_operations xfs_dir_file_operations = {
......
...@@ -913,18 +913,18 @@ xfs_reflink_set_inode_flag( ...@@ -913,18 +913,18 @@ xfs_reflink_set_inode_flag(
/* /*
* Update destination inode size & cowextsize hint, if necessary. * Update destination inode size & cowextsize hint, if necessary.
*/ */
STATIC int int
xfs_reflink_update_dest( xfs_reflink_update_dest(
struct xfs_inode *dest, struct xfs_inode *dest,
xfs_off_t newlen, xfs_off_t newlen,
xfs_extlen_t cowextsize, xfs_extlen_t cowextsize,
bool is_dedupe) unsigned int remap_flags)
{ {
struct xfs_mount *mp = dest->i_mount; struct xfs_mount *mp = dest->i_mount;
struct xfs_trans *tp; struct xfs_trans *tp;
int error; int error;
if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
return 0; return 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
...@@ -945,10 +945,6 @@ xfs_reflink_update_dest( ...@@ -945,10 +945,6 @@ xfs_reflink_update_dest(
dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
} }
if (!is_dedupe) {
xfs_trans_ichgtime(tp, dest,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
}
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
error = xfs_trans_commit(tp); error = xfs_trans_commit(tp);
...@@ -1112,19 +1108,28 @@ xfs_reflink_remap_extent( ...@@ -1112,19 +1108,28 @@ xfs_reflink_remap_extent(
/* /*
* Iteratively remap one file's extents (and holes) to another's. * Iteratively remap one file's extents (and holes) to another's.
*/ */
STATIC int int
xfs_reflink_remap_blocks( xfs_reflink_remap_blocks(
struct xfs_inode *src, struct xfs_inode *src,
xfs_fileoff_t srcoff, loff_t pos_in,
struct xfs_inode *dest, struct xfs_inode *dest,
xfs_fileoff_t destoff, loff_t pos_out,
xfs_filblks_t len, loff_t remap_len,
xfs_off_t new_isize) loff_t *remapped)
{ {
struct xfs_bmbt_irec imap; struct xfs_bmbt_irec imap;
xfs_fileoff_t srcoff;
xfs_fileoff_t destoff;
xfs_filblks_t len;
xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps; int nimaps;
int error = 0; int error = 0;
xfs_filblks_t range_len;
destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
len = XFS_B_TO_FSB(src->i_mount, remap_len);
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) { while (len) {
...@@ -1139,7 +1144,7 @@ xfs_reflink_remap_blocks( ...@@ -1139,7 +1144,7 @@ xfs_reflink_remap_blocks(
error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
xfs_iunlock(src, lock_mode); xfs_iunlock(src, lock_mode);
if (error) if (error)
goto err; break;
ASSERT(nimaps == 1); ASSERT(nimaps == 1);
trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
...@@ -1153,23 +1158,24 @@ xfs_reflink_remap_blocks( ...@@ -1153,23 +1158,24 @@ xfs_reflink_remap_blocks(
error = xfs_reflink_remap_extent(dest, &imap, destoff, error = xfs_reflink_remap_extent(dest, &imap, destoff,
new_isize); new_isize);
if (error) if (error)
goto err; break;
if (fatal_signal_pending(current)) { if (fatal_signal_pending(current)) {
error = -EINTR; error = -EINTR;
goto err; break;
} }
/* Advance drange/srange */ /* Advance drange/srange */
srcoff += range_len; srcoff += range_len;
destoff += range_len; destoff += range_len;
len -= range_len; len -= range_len;
remapped_len += range_len;
} }
return 0; if (error)
trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
err: *remapped = min_t(loff_t, remap_len,
trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); XFS_FSB_TO_B(src->i_mount, remapped_len));
return error; return error;
} }
...@@ -1218,7 +1224,7 @@ xfs_iolock_two_inodes_and_break_layout( ...@@ -1218,7 +1224,7 @@ xfs_iolock_two_inodes_and_break_layout(
} }
/* Unlock both inodes after they've been prepped for a range clone. */ /* Unlock both inodes after they've been prepped for a range clone. */
STATIC void void
xfs_reflink_remap_unlock( xfs_reflink_remap_unlock(
struct file *file_in, struct file *file_in,
struct file *file_out) struct file *file_out)
...@@ -1286,21 +1292,20 @@ xfs_reflink_zero_posteof( ...@@ -1286,21 +1292,20 @@ xfs_reflink_zero_posteof(
* stale data in the destination file. Hence we reject these clone attempts with * stale data in the destination file. Hence we reject these clone attempts with
* -EINVAL in this case. * -EINVAL in this case.
*/ */
STATIC int int
xfs_reflink_remap_prep( xfs_reflink_remap_prep(
struct file *file_in, struct file *file_in,
loff_t pos_in, loff_t pos_in,
struct file *file_out, struct file *file_out,
loff_t pos_out, loff_t pos_out,
u64 *len, loff_t *len,
bool is_dedupe) unsigned int remap_flags)
{ {
struct inode *inode_in = file_inode(file_in); struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in); struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out); struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out); struct xfs_inode *dest = XFS_I(inode_out);
bool same_inode = (inode_in == inode_out); bool same_inode = (inode_in == inode_out);
u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret; ssize_t ret;
/* Lock both files against IO */ /* Lock both files against IO */
...@@ -1323,29 +1328,11 @@ xfs_reflink_remap_prep( ...@@ -1323,29 +1328,11 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out)) if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock; goto out_unlock;
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, is_dedupe); len, remap_flags);
if (ret <= 0) if (ret < 0 || *len == 0)
goto out_unlock; goto out_unlock;
/*
* If the dedupe data matches, chop off the partial EOF block
* from the source file so we don't try to dedupe the partial
* EOF block.
*/
if (is_dedupe) {
*len &= ~blkmask;
} else if (*len & blkmask) {
/*
* The user is attempting to share a partial EOF block,
* if it's inside the destination EOF then reject it.
*/
if (pos_out + *len < i_size_read(inode_out)) {
ret = -EINVAL;
goto out_unlock;
}
}
/* Attach dquots to dest inode before changing block map */ /* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest); ret = xfs_qm_dqattach(dest);
if (ret) if (ret)
...@@ -1365,31 +1352,9 @@ xfs_reflink_remap_prep( ...@@ -1365,31 +1352,9 @@ xfs_reflink_remap_prep(
goto out_unlock; goto out_unlock;
/* Zap any page cache for the destination file's range. */ /* Zap any page cache for the destination file's range. */
truncate_inode_pages_range(&inode_out->i_data, pos_out, truncate_inode_pages_range(&inode_out->i_data,
PAGE_ALIGN(pos_out + *len) - 1); round_down(pos_out, PAGE_SIZE),
round_up(pos_out + *len, PAGE_SIZE) - 1);
/* If we're altering the file contents... */
if (!is_dedupe) {
/*
* ...update the timestamps (which will grab the ilock again
* from xfs_fs_dirty_inode, so we have to call it before we
* take the ilock).
*/
if (!(file_out->f_mode & FMODE_NOCMTIME)) {
ret = file_update_time(file_out);
if (ret)
goto out_unlock;
}
/*
* ...clear the security bits if the process is not being run
* by root. This keeps people from modifying setuid and setgid
* binaries.
*/
ret = file_remove_privs(file_out);
if (ret)
goto out_unlock;
}
return 1; return 1;
out_unlock: out_unlock:
...@@ -1397,72 +1362,6 @@ xfs_reflink_remap_prep( ...@@ -1397,72 +1362,6 @@ xfs_reflink_remap_prep(
return ret; return ret;
} }
/*
 * Link a range of blocks from one file to another.
 *
 * Fails with -EOPNOTSUPP when the filesystem lacks the reflink feature and
 * with -EIO after a forced shutdown.  Otherwise the request is prepared with
 * xfs_reflink_remap_prep(), which may adjust @len; a result of zero or less
 * from the prep step is returned as-is without remapping anything.
 *
 * After a successful prep, the blocks are remapped, the destination inode is
 * updated, and both files are unlocked before returning 0 or a negative
 * error code.
 */
int
xfs_reflink_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	u64			len,
	bool			is_dedupe)
{
	struct inode		*inode_in = file_inode(file_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	xfs_extlen_t		cowextsize = 0;
	ssize_t			ret;

	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return -EOPNOTSUPP;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, is_dedupe);
	if (ret <= 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	/* Byte positions become block offsets; the length is in whole blocks. */
	ret = xfs_reflink_remap_blocks(src, XFS_B_TO_FSBT(mp, pos_in),
			dest, XFS_B_TO_FSBT(mp, pos_out),
			XFS_B_TO_FSB(mp, len), pos_out + len);
	if (!ret) {
		/*
		 * Carry the cowextsize hint from src to dest if we're sharing
		 * the entire source file to the entire destination file, the
		 * source file has a cowextsize hint, and the destination file
		 * does not.
		 */
		if (pos_in == 0 && len == i_size_read(inode_in) &&
		    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
		    pos_out == 0 && len >= i_size_read(inode_out) &&
		    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
			cowextsize = src->i_d.di_cowextsize;

		ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
				is_dedupe);
	}

	/* Reached on both success and failure once the prep step succeeded. */
	xfs_reflink_remap_unlock(file_in, file_out);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return ret;
}
/* /*
* The user wants to preemptively CoW all shared blocks in this file, * The user wants to preemptively CoW all shared blocks in this file,
* which enables us to turn off the reflink flag. Iterate all * which enables us to turn off the reflink flag. Iterate all
......
...@@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, ...@@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count); xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); struct file *file_out, loff_t pos_out, loff_t len,
unsigned int remap_flags);
extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
struct xfs_inode *ip, bool *has_shared); struct xfs_inode *ip, bool *has_shared);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp); struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len); xfs_off_t len);
extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t *len,
unsigned int remap_flags);
extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
loff_t *remapped);
extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
xfs_extlen_t cowextsize, unsigned int remap_flags);
extern void xfs_reflink_remap_unlock(struct file *file_in,
struct file *file_out);
#endif /* __XFS_REFLINK_H */ #endif /* __XFS_REFLINK_H */
...@@ -1752,6 +1752,25 @@ struct block_device_operations; ...@@ -1752,6 +1752,25 @@ struct block_device_operations;
#define NOMMU_VMFLAGS \ #define NOMMU_VMFLAGS \
(NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.txt for more details about this call.
 *
 * Each flag occupies its own bit so that flags can be OR-ed together in the
 * remap_flags argument:
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP (1 << 0)
#define REMAP_FILE_CAN_SHORTEN (1 << 1)
/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 *
 * REMAP_FILE_ADVISORY is the mask of every flag in this category; at present
 * that is only REMAP_FILE_CAN_SHORTEN.
 */
#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
struct iov_iter; struct iov_iter;
...@@ -1790,10 +1809,9 @@ struct file_operations { ...@@ -1790,10 +1809,9 @@ struct file_operations {
#endif #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int); loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
u64); struct file *file_out, loff_t pos_out,
int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, loff_t len, unsigned int remap_flags);
u64);
int (*fadvise)(struct file *, loff_t, loff_t, int); int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout; } __randomize_layout;
...@@ -1856,21 +1874,21 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, ...@@ -1856,21 +1874,21 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *, rwf_t); unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int); loff_t, size_t, unsigned int);
extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct inode *inode_out, loff_t pos_out, struct file *file_out, loff_t pos_out,
u64 *len, bool is_dedupe); loff_t *count,
extern int do_clone_file_range(struct file *file_in, loff_t pos_in, unsigned int remap_flags);
struct file *file_out, loff_t pos_out, u64 len); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out,
struct file *file_out, loff_t pos_out, u64 len); loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct inode *dest, loff_t destoff, struct file *file_out, loff_t pos_out,
loff_t len, bool *is_same); loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file, extern int vfs_dedupe_file_range(struct file *file,
struct file_dedupe_range *same); struct file_dedupe_range *same);
extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos, struct file *dst_file, loff_t dst_pos,
u64 len); loff_t len, unsigned int remap_flags);
struct super_operations { struct super_operations {
...@@ -2998,6 +3016,9 @@ extern int sb_min_blocksize(struct super_block *, int); ...@@ -2998,6 +3016,9 @@ extern int sb_min_blocksize(struct super_block *, int);
extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count, unsigned int remap_flags);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
......
...@@ -2824,6 +2824,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping, ...@@ -2824,6 +2824,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
} }
EXPORT_SYMBOL(read_cache_page_gfp); EXPORT_SYMBOL(read_cache_page_gfp);
/*
 * Keep an access inside the range the page cache supports and inside the
 * LFS limits.
 *
 * The ceiling is the superblock's s_maxbytes, replaced by MAX_NON_LFS when
 * the file was opened without O_LARGEFILE.  A pos at or beyond the ceiling
 * fails with -EFBIG.  A pos below it succeeds with 0, and *count is trimmed
 * if needed so the access stops at the ceiling (a short access).
 */
static int generic_access_check_limits(struct file *file, loff_t pos,
				       loff_t *count)
{
	loff_t ceiling = file->f_mapping->host->i_sb->s_maxbytes;
	loff_t room;

	/* Without O_LARGEFILE the non-LFS limit replaces the fs limit. */
	if (!(file->f_flags & O_LARGEFILE))
		ceiling = MAX_NON_LFS;

	if (unlikely(pos >= ceiling))
		return -EFBIG;

	/* Bytes left between pos and the ceiling bound the request. */
	room = ceiling - pos;
	if (room < *count)
		*count = room;

	return 0;
}
/*
 * Apply the RLIMIT_FSIZE resource limit to a write of *count bytes at pos,
 * then apply the generic access limits.
 *
 * With a finite limit, a pos at or beyond it sends SIGXFSZ to the current
 * task and fails with -EFBIG; otherwise *count is trimmed so the write ends
 * at the limit.  The result of generic_access_check_limits() on the
 * (possibly trimmed) count is then returned.
 */
static int generic_write_check_limits(struct file *file, loff_t pos,
				      loff_t *count)
{
	loff_t fsize_cap = rlimit(RLIMIT_FSIZE);
	loff_t room;

	/* No file size rlimit: only the access limits apply. */
	if (fsize_cap == RLIM_INFINITY)
		return generic_access_check_limits(file, pos, count);

	if (pos >= fsize_cap) {
		send_sig(SIGXFSZ, current, 0);
		return -EFBIG;
	}

	/* Shorten the request so it ends at the rlimit. */
	room = fsize_cap - pos;
	if (room < *count)
		*count = room;

	return generic_access_check_limits(file, pos, count);
}
/* /*
* Performs necessary checks before doing a write * Performs necessary checks before doing a write
* *
...@@ -2835,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) ...@@ -2835,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host; struct inode *inode = file->f_mapping->host;
unsigned long limit = rlimit(RLIMIT_FSIZE); loff_t count;
loff_t pos; int ret;
if (!iov_iter_count(from)) if (!iov_iter_count(from))
return 0; return 0;
...@@ -2845,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) ...@@ -2845,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND) if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode); iocb->ki_pos = i_size_read(inode);
pos = iocb->ki_pos;
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL; return -EINVAL;
if (limit != RLIM_INFINITY) { count = iov_iter_count(from);
if (iocb->ki_pos >= limit) { ret = generic_write_check_limits(file, iocb->ki_pos, &count);
send_sig(SIGXFSZ, current, 0); if (ret)
return -EFBIG; return ret;
}
iov_iter_truncate(from, limit - (unsigned long)pos); iov_iter_truncate(from, count);
} return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
/*
* Performs necessary checks before doing a clone.
*
* Can adjust amount of bytes to clone.
* Returns appropriate error code that caller should return or
* zero in case the clone should be allowed.
*/
int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *req_count, unsigned int remap_flags)
{
struct inode *inode_in = file_in->f_mapping->host;
struct inode *inode_out = file_out->f_mapping->host;
uint64_t count = *req_count;
uint64_t bcount;
loff_t size_in, size_out;
loff_t bs = inode_out->i_sb->s_blocksize;
int ret;
/* The start of both ranges must be aligned to an fs block. */
if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
return -EINVAL;
/* Ensure offsets don't wrap. */
if (pos_in + count < pos_in || pos_out + count < pos_out)
return -EINVAL;
size_in = i_size_read(inode_in);
size_out = i_size_read(inode_out);
/* Dedupe requires both ranges to be within EOF. */
if ((remap_flags & REMAP_FILE_DEDUP) &&
(pos_in >= size_in || pos_in + count > size_in ||
pos_out >= size_out || pos_out + count > size_out))
return -EINVAL;
/* Ensure the infile range is within the infile. */
if (pos_in >= size_in)
return -EINVAL;
count = min(count, size_in - (uint64_t)pos_in);
ret = generic_access_check_limits(file_in, pos_in, &count);
if (ret)
return ret;
ret = generic_write_check_limits(file_out, pos_out, &count);
if (ret)
return ret;
/* /*
* LFS rule * If the user wanted us to link to the infile's EOF, round up to the
* next block boundary for this check.
*
* Otherwise, make sure the count is also block-aligned, having
* already confirmed the starting offsets' block alignment.
*/ */
if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && if (pos_in + count == size_in) {
!(file->f_flags & O_LARGEFILE))) { bcount = ALIGN(size_in, bs) - pos_in;
if (pos >= MAX_NON_LFS) } else {
return -EFBIG; if (!IS_ALIGNED(count, bs))
iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); count = ALIGN_DOWN(count, bs);
bcount = count;
} }
/* Don't allow overlapped cloning within the same file. */
if (inode_in == inode_out &&
pos_out + bcount > pos_in &&
pos_out < pos_in + bcount)
return -EINVAL;
/* /*
* Are we about to exceed the fs block limit ? * We shortened the request but the caller can't deal with that, so
* * bounce the request back to userspace.
* If we have written data it becomes a short write. If we have
* exceeded without writing data we send a signal and return EFBIG.
* Linus frestrict idea will clean these up nicely..
*/ */
if (unlikely(pos >= inode->i_sb->s_maxbytes)) if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
return -EFBIG; return -EINVAL;
iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); *req_count = count;
return iov_iter_count(from); return 0;
} }
EXPORT_SYMBOL(generic_write_checks);
int pagecache_write_begin(struct file *file, struct address_space *mapping, int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags, loff_t pos, unsigned len, unsigned flags,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment