Commit 65dd2aa9 authored by Andi Kleen's avatar Andi Kleen Committed by Linus Torvalds

dio: optimize cache misses in the submission path

Some investigation of a transaction processing workload showed that a
major consumer of cycles in __blockdev_direct_IO is the cache miss while
accessing the block size.  This is because it has to walk the chain from
block_dev to gendisk to queue.

The block size is needed early on to check alignment and sizes.  It's only
done if the check for the inode block size fails.  But the costly block
device state is unconditionally fetched.

- Reorganize the code to only fetch block dev state when actually
  needed.

Then do a prefetch on the block dev early on in the direct IO path.  This
is worth it, because there is substantial code run before we actually
touch the block dev now.

- I also added some unlikelies to make it clear the compiler that block
  device fetch code is not normally executed.

This gave a small, but measurable improvement on a large database
benchmark (about 0.3%)

[akpm@linux-foundation.org: coding-style fixes]
[sfr@canb.auug.org.au: using prefetch requires including prefetch.h]
Signed-off-by: default avatarAndi Kleen <ak@linux.intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: default avatarStephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 87192a2a
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <linux/rwsem.h> #include <linux/rwsem.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/atomic.h> #include <linux/atomic.h>
#include <linux/prefetch.h>
/* /*
* How many user pages to map in one call to get_user_pages(). This determines * How many user pages to map in one call to get_user_pages(). This determines
...@@ -1087,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) ...@@ -1087,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important * individual fields and will generate much worse code. This is important
* for the whole file. * for the whole file.
*/ */
ssize_t static inline ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset, struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags) dio_submit_t submit_io, int flags)
...@@ -1097,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1097,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
size_t size; size_t size;
unsigned long addr; unsigned long addr;
unsigned blkbits = inode->i_blkbits; unsigned blkbits = inode->i_blkbits;
unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1; unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL; ssize_t retval = -EINVAL;
loff_t end = offset; loff_t end = offset;
...@@ -1110,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1110,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw & WRITE) if (rw & WRITE)
rw = WRITE_ODIRECT; rw = WRITE_ODIRECT;
if (bdev) /*
bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); * Avoid references to bdev if not absolutely needed to give
* the early prefetch in the caller enough time.
*/
if (offset & blocksize_mask) { if (offset & blocksize_mask) {
if (bdev) if (bdev)
blkbits = bdev_blkbits; blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1; blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask) if (offset & blocksize_mask)
goto out; goto out;
...@@ -1126,9 +1128,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1126,9 +1128,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
addr = (unsigned long)iov[seg].iov_base; addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len; size = iov[seg].iov_len;
end += size; end += size;
if ((addr & blocksize_mask) || (size & blocksize_mask)) { if (unlikely((addr & blocksize_mask) ||
(size & blocksize_mask))) {
if (bdev) if (bdev)
blkbits = bdev_blkbits; blkbits = blksize_bits(
bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1; blocksize_mask = (1 << blkbits) - 1;
if ((addr & blocksize_mask) || (size & blocksize_mask)) if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out; goto out;
...@@ -1313,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1313,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
out: out:
return retval; return retval;
} }
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
/*
* The block device state is needed in the end to finally
* submit everything. Since it's likely to be cache cold
* prefetch it here as first thing to hide some of the
* latency.
*
* Attempt to prefetch the pieces we likely need later.
*/
prefetch(&bdev->bd_disk->part_tbl);
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
nr_segs, get_block, end_io,
submit_io, flags);
}
EXPORT_SYMBOL(__blockdev_direct_IO); EXPORT_SYMBOL(__blockdev_direct_IO);
static __init int dio_init(void) static __init int dio_init(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment