Commit 725737e7 authored by Linus Torvalds

Merge tag 'statx-dioalign-for-linus' of...

Merge tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull STATX_DIOALIGN support from Eric Biggers:
 "Make statx() support reporting direct I/O (DIO) alignment information.

  This provides a generic interface for userspace programs to determine
  whether a file supports DIO, and if so with what alignment
  restrictions. Specifically, STATX_DIOALIGN works on block devices, and
  on regular files when their containing filesystem has implemented
  support.

  An interface like this has been requested for years, since the
  conditions for when DIO is supported in Linux have gotten increasingly
  complex over time. Today, DIO support and alignment requirements can
  be affected by various filesystem features such as multi-device
  support, data journalling, inline data, encryption, verity,
  compression, checkpoint disabling, log-structured mode, etc.

  Further complicating things, Linux v6.0 relaxed the traditional rule
  of DIO needing to be aligned to the block device's logical block size;
  now user buffers (but not file offsets) only need to be aligned to the
  DMA alignment.

  The approach of uplifting the XFS specific ioctl XFS_IOC_DIOINFO was
  discarded in favor of creating a clean new interface with statx().

  For more information, see the individual commits and the man page
  update[1]"

Link: https://lore.kernel.org/r/20220722074229.148925-1-ebiggers@kernel.org [1]

* tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  xfs: support STATX_DIOALIGN
  f2fs: support STATX_DIOALIGN
  f2fs: simplify f2fs_force_buffered_io()
  f2fs: move f2fs_force_buffered_io() into file.c
  ext4: support STATX_DIOALIGN
  fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN
  vfs: support STATX_DIOALIGN on block devices
  statx: add direct I/O alignment information
parents 5779aa2d 61a223df
......@@ -26,6 +26,7 @@
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"
......@@ -1069,3 +1070,25 @@ void sync_bdevs(bool wait)
spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
/*
 * Report direct I/O alignment (STATX_DIOALIGN) for a block device node.
 *
 * @inode: inode of the block device node file
 * @stat:  kstat being filled in
 *
 * @inode belongs to the device node file, not to the block device's internal
 * inode, so I_BDEV() is *not* valid on it; the device is located through
 * i_rdev instead.  When that lookup returns NULL, @stat is left untouched and
 * STATX_DIOALIGN is not added to the result mask.
 */
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
	struct block_device *bdev = blkdev_get_no_open(inode->i_rdev);

	if (bdev) {
		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
		stat->dio_offset_align = bdev_logical_block_size(bdev);
		stat->result_mask |= STATX_DIOALIGN;

		/* Pairs with blkdev_get_no_open() above. */
		blkdev_put_no_open(bdev);
	}
}
......@@ -396,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
/**
* fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
* supported as far as encryption is concerned
* @iocb: the file and position the I/O is targeting
* @iter: the I/O data segment(s)
* fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
* inode, as far as encryption is concerned
* @inode: the inode in question
*
* Return: %true if there are no encryption constraints that prevent DIO from
* being supported; %false if DIO is unsupported. (Note that in the
* %true case, the filesystem might have other, non-encryption-related
* constraints that prevent DIO from actually being supported.)
* constraints that prevent DIO from actually being supported. Also, on
* encrypted files the filesystem is still responsible for only allowing
* DIO when requests are filesystem-block-aligned.)
*/
bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
bool fscrypt_dio_supported(struct inode *inode)
{
const struct inode *inode = file_inode(iocb->ki_filp);
const unsigned int blocksize = i_blocksize(inode);
int err;
/* If the file is unencrypted, no veto from us. */
if (!fscrypt_needs_contents_encryption(inode))
return true;
/* We only support DIO with inline crypto, not fs-layer crypto. */
if (!fscrypt_inode_uses_inline_crypto(inode))
return false;
/*
* Since the granularity of encryption is filesystem blocks, the file
* position and total I/O length must be aligned to the filesystem block
* size -- not just to the block device's logical block size as is
* traditionally the case for DIO on many filesystems.
* We only support DIO with inline crypto, not fs-layer crypto.
*
* We require that the user-provided memory buffers be filesystem block
* aligned too. It is simpler to have a single alignment value required
* for all properties of the I/O, as is normally the case for DIO.
* Also, allowing less aligned buffers would imply that data units could
* cross bvecs, which would greatly complicate the I/O stack, which
* assumes that bios can be split at any bvec boundary.
* To determine whether the inode is using inline crypto, we have to set
* up the key if it wasn't already done. This is because in the current
* design of fscrypt, the decision of whether to use inline crypto or
* not isn't made until the inode's encryption key is being set up. In
* the DIO read/write case, the key will always be set up already, since
* the file will be open. But in the case of statx(), the key might not
* be set up yet, as the file might not have been opened yet.
*/
err = fscrypt_require_key(inode);
if (err) {
/*
* Key unavailable or couldn't be set up. This edge case isn't
* worth worrying about; just report that DIO is unsupported.
*/
if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
return false;
return true;
}
return fscrypt_inode_uses_inline_crypto(inode);
}
EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
......
......@@ -2977,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct user_namespace *, struct dentry *,
struct iattr *);
extern u32 ext4_dio_alignment(struct inode *inode);
extern int ext4_getattr(struct user_namespace *, const struct path *,
struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
......
......@@ -36,19 +36,34 @@
#include "acl.h"
#include "truncate.h"
static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
/*
* Returns %true if the given DIO request should be attempted with DIO, or
* %false if it should fall back to buffered I/O.
*
* DIO isn't well specified; when it's unsupported (either due to the request
* being misaligned, or due to the file not supporting DIO at all), filesystems
* either fall back to buffered I/O or return EINVAL. For files that don't use
* any special features like encryption or verity, ext4 has traditionally
* returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
* In this case, we should attempt the DIO, *not* fall back to buffered I/O.
*
* In contrast, in cases where DIO is unsupported due to ext4 features, ext4
* traditionally falls back to buffered I/O.
*
* This function implements the traditional ext4 behavior in all these cases.
*/
static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
u32 dio_align = ext4_dio_alignment(inode);
if (!fscrypt_dio_supported(iocb, iter))
return false;
if (fsverity_active(inode))
return false;
if (ext4_should_journal_data(inode))
return false;
if (ext4_has_inline_data(inode))
if (dio_align == 0)
return false;
if (dio_align == 1)
return true;
return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
......@@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
if (!ext4_dio_supported(iocb, to)) {
if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
......@@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
if (!ext4_dio_supported(iocb, from)) {
if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
......
......@@ -5550,6 +5550,22 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return error;
}
/*
 * Compute the direct I/O alignment value ext4 applies to @inode.
 *
 * Return:
 *   0 - one of the checks below rejected the inode (fs-verity active, data
 *       journalling, inline data, or encrypted without fscrypt DIO support)
 *   1 - none of those apply and the inode is not encrypted; use the iomap
 *       defaults
 *   i_blocksize(inode) - the inode is encrypted and fscrypt allows DIO
 */
u32 ext4_dio_alignment(struct inode *inode)
{
	if (fsverity_active(inode) || ext4_should_journal_data(inode) ||
	    ext4_has_inline_data(inode))
		return 0;

	if (!IS_ENCRYPTED(inode))
		return 1;	/* use the iomap defaults */

	if (fscrypt_dio_supported(inode))
		return i_blocksize(inode);

	return 0;
}
int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
......@@ -5565,6 +5581,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
/*
* Return the DIO alignment restrictions if requested. We only return
* this information when requested, since on encrypted files it might
* take a fair bit of work to get if the file wasn't opened recently.
*/
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
u32 dio_align = ext4_dio_alignment(inode);
stat->result_mask |= STATX_DIOALIGN;
if (dio_align == 1) {
struct block_device *bdev = inode->i_sb->s_bdev;
/* iomap defaults */
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
} else {
stat->dio_mem_align = dio_align;
stat->dio_offset_align = dio_align;
}
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
......
......@@ -4471,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
/*
 * Return nonzero when the request described by @iocb and @iter is not aligned
 * to @inode's block size, i.e. when the file position or the value of
 * iov_iter_alignment(@iter) has any bit set below (1 << i_blkbits).
 * Return 0 otherwise.
 */
static inline int block_unaligned_IO(struct inode *inode,
				struct kiocb *iocb, struct iov_iter *iter)
{
	/* Read i_blkbits exactly once so the mask is built from one value. */
	unsigned int blkbits = READ_ONCE(inode->i_blkbits);
	unsigned int low_bits_mask = (1 << blkbits) - 1;
	loff_t pos = iocb->ki_pos;
	unsigned long combined = pos | iov_iter_alignment(iter);

	return combined & low_bits_mask;
}
static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
int flag)
{
......@@ -4492,35 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
return sbi->aligned_blksize;
}
/*
 * Decide whether a direct I/O request on @inode must be served through
 * buffered I/O instead.
 *
 * Return: true to force buffered I/O, false when none of the checks below
 * objects to direct I/O.
 */
static inline bool f2fs_force_buffered_io(struct inode *inode,
				struct kiocb *iocb, struct iov_iter *iter)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	int rw = iov_iter_rw(iter);

	/* fscrypt, fs-verity and compression checks on the file itself */
	if (!fscrypt_dio_supported(iocb, iter) || fsverity_active(inode) ||
	    f2fs_compressed_file(inode))
		return true;

	/* disallow direct IO if any of devices has unaligned blksize */
	if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
		return true;

	/* Extra restrictions that apply only to writes in LFS mode. */
	if (f2fs_lfs_mode(sbi) && rw == WRITE &&
	    (block_unaligned_IO(inode, iocb, iter) || F2FS_IO_ALIGNED(sbi)))
		return true;

	return is_sbi_flag_set(sbi, SBI_CP_DISABLED);
}
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
{
return fsverity_active(inode) &&
......
......@@ -808,6 +808,29 @@ int f2fs_truncate(struct inode *inode)
return 0;
}
/*
 * Tell whether I/O in direction @rw on @inode has to go through the buffered
 * path rather than direct I/O.
 *
 * @inode: the file being accessed
 * @rw:    compared against WRITE; one of the checks applies to writes only
 *
 * Return: true when buffered I/O must be used, false otherwise.
 */
static bool f2fs_force_buffered_io(struct inode *inode, int rw)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);

	if (!fscrypt_dio_supported(inode) || fsverity_active(inode))
		return true;
	if (f2fs_compressed_file(inode))
		return true;

	/* disallow direct IO if any of devices has unaligned blksize */
	if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
		return true;

	/* Write-only restriction: LFS mode combined with F2FS_IO_ALIGNED(). */
	if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
		return true;

	return is_sbi_flag_set(sbi, SBI_CP_DISABLED);
}
int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
......@@ -824,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
}
/*
* Return the DIO alignment restrictions if requested. We only return
* this information when requested, since on encrypted files it might
* take a fair bit of work to get if the file wasn't opened recently.
*
* f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN
* cannot represent that, so in that case we report no DIO support.
*/
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
unsigned int bsize = i_blocksize(inode);
stat->result_mask |= STATX_DIOALIGN;
if (!f2fs_force_buffered_io(inode, WRITE)) {
stat->dio_mem_align = bsize;
stat->dio_offset_align = bsize;
}
}
flags = fi->i_flags;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
......@@ -4182,7 +4223,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
if (!(iocb->ki_flags & IOCB_DIRECT))
return false;
if (f2fs_force_buffered_io(inode, iocb, iter))
if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
return false;
/*
......
......@@ -5,6 +5,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
......@@ -230,11 +231,22 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
goto out;
error = vfs_getattr(&path, stat, request_mask, flags);
stat->mnt_id = real_mount(path.mnt)->mnt_id;
stat->result_mask |= STATX_MNT_ID;
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
/* Handle STATX_DIOALIGN for block devices. */
if (request_mask & STATX_DIOALIGN) {
struct inode *inode = d_backing_inode(path.dentry);
if (S_ISBLK(inode->i_mode))
bdev_statx_dioalign(inode, stat);
}
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
......@@ -611,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_major = MAJOR(stat->dev);
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
tmp.stx_dio_mem_align = stat->dio_mem_align;
tmp.stx_dio_offset_align = stat->dio_offset_align;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
......
......@@ -604,6 +604,16 @@ xfs_vn_getattr(
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
case S_IFREG:
if (request_mask & STATX_DIOALIGN) {
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
struct block_device *bdev = target->bt_bdev;
stat->result_mask |= STATX_DIOALIGN;
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
......
......@@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev);
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
int sync_blockdev_nowait(struct block_device *bdev);
void sync_bdevs(bool wait);
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
void printk_all_partitions(void);
#else
static inline void invalidate_bdev(struct block_device *bdev)
......@@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev)
static inline void sync_bdevs(bool wait)
{
}
/*
 * Empty fallback for bdev_statx_dioalign(): it leaves @stat untouched, so
 * STATX_DIOALIGN is never added to the result mask through this variant.
 * NOTE(review): this sits in an #else branch whose controlling #if is not
 * visible here -- presumably the configuration without block layer support;
 * confirm.
 */
static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
}
static inline void printk_all_partitions(void)
{
}
......
......@@ -764,7 +764,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
bool fscrypt_mergeable_bio_bh(struct bio *bio,
const struct buffer_head *next_bh);
bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter);
bool fscrypt_dio_supported(struct inode *inode);
u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);
......@@ -797,11 +797,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
return true;
}
static inline bool fscrypt_dio_supported(struct kiocb *iocb,
struct iov_iter *iter)
static inline bool fscrypt_dio_supported(struct inode *inode)
{
const struct inode *inode = file_inode(iocb->ki_filp);
return !fscrypt_needs_contents_encryption(inode);
}
......
......@@ -50,6 +50,8 @@ struct kstat {
struct timespec64 btime; /* File creation time */
u64 blocks;
u64 mnt_id;
u32 dio_mem_align;
u32 dio_offset_align;
};
#endif
......@@ -124,7 +124,8 @@ struct statx {
__u32 stx_dev_minor;
/* 0x90 */
__u64 stx_mnt_id;
__u64 __spare2;
__u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */
__u32 stx_dio_offset_align; /* File offset alignment for direct I/O */
/* 0xa0 */
__u64 __spare3[12]; /* Spare space for future expansion */
/* 0x100 */
......@@ -152,6 +153,7 @@ struct statx {
#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
#define STATX_BTIME 0x00000800U /* Want/got stx_btime */
#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment