Commit 8fa49846 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] readahead

I'd like to be able to claim amazing speedups, but
the best benchmark I could find was diffing two
256 megabyte files, which is about 10% quicker.  And
that is probably due to the window size being effectively
50% larger.

Fact is, any disk worth owning nowadays has a segmented
2-megabyte cache, and OS-level readahead mainly seems
to save on CPU cycles rather than overall throughput.
Once you start reading more streams than there are segments
in the disk cache, we start to win.

Still.  The main motivation for this work is to
clean the code up, and to create a central point at
which many pages are marshalled together so that
they can all be encapsulated into the smallest possible
number of BIOs, and injected into the request layer.

A number of filesystems were poking around inside the
readahead state variables.  I'm not really sure what they
were up to, but I took all that out.  The readahead
code manages its own state autonomously and should not
need any hints.

- Unifies the current three readahead functions (mmap reads, read(2)
  and sys_readahead) into a single implementation.

- More aggressive in building up the readahead windows.

- More conservative in tearing them down.

- Special start-of-file heuristics.

- Preallocates the readahead pages, to avoid the (never demonstrated,
  but potentially catastrophic) scenario where allocation of readahead
  pages causes the allocator to perform VM writeout.

- Gets all the readahead pages gathered together in
  one spot, so they can be marshalled into big BIOs.

- Reinstates the readahead ioctls, so hdparm(8) and blockdev(8)
  are working again.  The readahead settings are now per-request-queue,
  and the drivers never have to know about it.  I use blockdev(8);
  it works in units of 512-byte sectors.

- Identifies readahead thrashing.

  Also attempts to handle it.  Certainly the changes here
  delay the onset of catastrophic readahead thrashing by
  quite a lot, and decrease its seriousness as we get more
  deeply into it, but it's still pretty bad.
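
By way of illustration, here is a small, compilable model of the
two-window scheme the points above describe.  The field names match
struct file_ra_state as added by this patch; the doubling/halving
policy, the RA_MIN_PAGES/RA_MAX_PAGES constants and the demo loop are
assumptions made for the sketch, not the kernel's exact logic.

/*
 * Toy model of the two-window readahead scheme.  Field names follow
 * struct file_ra_state from this patch; RA_MIN_PAGES, RA_MAX_PAGES
 * and the grow/shrink policy are illustrative assumptions.
 */
#include <stdio.h>

#define RA_MIN_PAGES	3	/* assumed floor */
#define RA_MAX_PAGES	31	/* assumed ceiling */

struct file_ra_state {
	unsigned long start;		/* current window */
	unsigned long size;
	unsigned long next_size;	/* next window size */
	unsigned long prev_page;	/* last read() position */
	unsigned long ahead_start;	/* ahead window */
	unsigned long ahead_size;
};

static void ra_advance(struct file_ra_state *ra, unsigned long page)
{
	if (page == ra->prev_page + 1) {
		/* Sequential hit: build the window up aggressively. */
		ra->next_size *= 2;
		if (ra->next_size > RA_MAX_PAGES)
			ra->next_size = RA_MAX_PAGES;
	} else {
		/* Miss: tear down conservatively - halve, don't zero. */
		ra->next_size /= 2;
		if (ra->next_size < RA_MIN_PAGES)
			ra->next_size = RA_MIN_PAGES;
	}
	ra->prev_page = page;

	if (ra->size == 0) {
		/* Start-of-file: submit an initial window immediately. */
		ra->start = page;
		ra->size = ra->next_size;
	} else if (ra->ahead_size && page >= ra->ahead_start) {
		/* Reader crossed into the ahead window: promote it. */
		ra->start = ra->ahead_start;
		ra->size = ra->ahead_size;
		ra->ahead_size = 0;
	}
	if (ra->ahead_size == 0) {
		/* Queue a new ahead window; this is the point where the
		 * kernel would marshal the pages into big BIOs. */
		ra->ahead_start = ra->start + ra->size;
		ra->ahead_size = ra->next_size;
	}
}

int main(void)
{
	struct file_ra_state ra = { 0, 0, RA_MIN_PAGES, (unsigned long)-1, 0, 0 };
	unsigned long page;

	for (page = 0; page < 64; page++) {	/* a purely sequential reader */
		ra_advance(&ra, page);
		printf("page %2lu: window [%lu,%lu) ahead [%lu,%lu)\n",
			page, ra.start, ra.start + ra.size,
			ra.ahead_start, ra.ahead_start + ra.ahead_size);
	}
	return 0;
}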
parent 3d30a6cc
@@ -237,6 +237,18 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg)
 		intval = (is_read_only(dev) != 0);
 		return put_user(intval, (int *)(arg));
+	case BLKRASET:
+	case BLKFRASET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return blk_set_readahead(dev, arg);
+	case BLKRAGET:
+	case BLKFRAGET:
+		if (!arg)
+			return -EINVAL;
+		return put_user(blk_get_readahead(dev), (long *)arg);
 	case BLKSECTGET:
 		if ((q = blk_get_queue(dev)) == NULL)
			return -EINVAL;
......
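The hunk above wires the reinstated ioctls into blk_ioctl().  For
context, here is a minimal userspace sketch of what blockdev(8) does
with them.  BLKRAGET and BLKRASET are the real ioctl names; the device
path and the 256-sector value are just example choices.

/*
 * Query and set a block device's readahead, the way blockdev(8)
 * and hdparm(8) do.  Units are 512-byte sectors; BLKRASET needs
 * CAP_SYS_ADMIN.  /dev/hda and 256 sectors are example values.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKRAGET, BLKRASET */

int main(void)
{
	long ra;
	int fd = open("/dev/hda", O_RDONLY);	/* example device */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKRAGET, &ra) == 0)	/* value via (long *)arg */
		printf("readahead: %ld sectors\n", ra);
	if (ioctl(fd, BLKRASET, 256L) != 0)	/* arg passed by value */
		perror("BLKRASET");
	close(fd);
	return 0;
}

The stock tools drive the same ioctls: blockdev --getra /dev/hda and
blockdev --setra 256 /dev/hda.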
@@ -108,6 +108,47 @@ inline request_queue_t *blk_get_queue(kdev_t dev)
 	return &blk_dev[major(dev)].request_queue;
 }
 
+/**
+ * blk_set_readahead - set a queue's readahead tunable
+ * @dev: device
+ * @sectors: readahead, in 512 byte sectors
+ *
+ * Returns zero on success, else negative errno
+ */
+int blk_set_readahead(kdev_t dev, unsigned sectors)
+{
+	int ret = -EINVAL;
+	request_queue_t *q = blk_get_queue(dev);
+
+	if (q) {
+		q->ra_sectors = sectors;
+		ret = 0;
+	}
+	return ret;
+}
+
+/**
+ * blk_get_readahead - query a queue's readahead tunable
+ * @dev: device
+ *
+ * Locates the passed device's request queue and returns its
+ * readahead setting.
+ *
+ * The returned value is in units of 512 byte sectors.
+ *
+ * Will return zero if the queue has never had its readahead
+ * setting altered.
+ */
+unsigned blk_get_readahead(kdev_t dev)
+{
+	unsigned ret = 0;
+	request_queue_t *q = blk_get_queue(dev);
+
+	if (q)
+		ret = q->ra_sectors;
+	return ret;
+}
+
 void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
 {
 	q->prep_rq_fn = pfn;
@@ -810,7 +851,8 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
 	q->plug_tq.data = q;
 	q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
 	q->queue_lock = lock;
+	q->ra_sectors = 0;		/* Use VM default */
 
 	blk_queue_segment_boundary(q, 0xffffffff);
 	blk_queue_make_request(q, __make_request);
......
@@ -1577,7 +1577,7 @@ static int device_size_calculation(mddev_t * mddev)
 	if (!md_size[mdidx(mddev)])
 		md_size[mdidx(mddev)] = sb->size * data_disks;
 
-	readahead = MD_READAHEAD;
+	readahead = (blk_get_readahead(rdev->dev) * 512) / PAGE_SIZE;
 	if (!sb->level || (sb->level == 4) || (sb->level == 5)) {
 		readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
 		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
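As a worked example of the conversion above: with 4096-byte pages, a
queue setting of 256 sectors (an arbitrary example value) comes out to
(256 * 512) / 4096 = 32 pages of readahead.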
@@ -3387,7 +3387,7 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
 	/*
 	 * Tune reconstruction:
 	 */
-	window = MAX_READAHEAD*(PAGE_SIZE/512);
+	window = 32*(PAGE_SIZE/512);
 	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
 		window/2,max_sectors/2);
 
@@ -3605,7 +3605,7 @@ static void md_geninit(void)
 	for(i = 0; i < MAX_MD_DEVS; i++) {
 		md_blocksizes[i] = 1024;
 		md_size[i] = 0;
-		md_maxreadahead[i] = MD_READAHEAD;
+		md_maxreadahead[i] = 32;
 	}
 	blksize_size[MAJOR_NR] = md_blocksizes;
 	blk_size[MAJOR_NR] = md_size;
......
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
 #include <linux/module.h>
+#include <linux/blkpg.h>
 
 #include <asm/uaccess.h>
@@ -172,7 +173,6 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 	if (offset >= 0 && offset <= size) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
-			file->f_reada = 0;
 			file->f_version = ++event;
 		}
 		retval = offset;
@@ -692,9 +692,20 @@ int blkdev_close(struct inode * inode, struct file * filp)
 static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 			unsigned long arg)
 {
-	if (inode->i_bdev->bd_op->ioctl)
-		return inode->i_bdev->bd_op->ioctl(inode, file, cmd, arg);
-	return -EINVAL;
+	int ret = -EINVAL;
+	switch (cmd) {
+	case BLKRAGET:
+	case BLKFRAGET:
+	case BLKRASET:
+	case BLKFRASET:
+		ret = blk_ioctl(inode->i_bdev, cmd, arg);
+		break;
+	default:
+		if (inode->i_bdev->bd_op->ioctl)
+			ret = inode->i_bdev->bd_op->ioctl(inode, file, cmd, arg);
+		break;
+	}
+	return ret;
 }
 
 struct address_space_operations def_blk_aops = {
......
@@ -166,7 +166,6 @@ static hfs_rwret_t hfs_file_read(struct file * filp, char * buf,
 	}
 	if ((read = hfs_do_read(inode, HFS_I(inode)->fork, pos, buf, left)) > 0) {
 		*ppos += read;
-		filp->f_reada = 1;
 	}
 
 	return read;
......
@@ -105,7 +105,6 @@ static loff_t cap_info_llseek(struct file *file, loff_t offset, int origin)
 	if (offset>=0 && offset<=HFS_FORK_MAX) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
-			file->f_reada = 0;
 		}
 		retval = offset;
 	}
......
@@ -361,7 +361,6 @@ loff_t hdr_llseek(struct file *file, loff_t offset, int origin)
 	if (offset>=0 && offset<file->f_dentry->d_inode->i_size) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
-			file->f_reada = 0;
 		}
 		retval = offset;
 	}
 
@@ -594,7 +593,6 @@ static hfs_rwret_t hdr_read(struct file * filp, char * buf,
 	} else if (fork) {
 		left = hfs_do_read(inode, fork, offset, buf, left);
 		if (left > 0) {
-			filp->f_reada = 1;
 		} else if (!read) {
 			return left;
 		} else {
......
@@ -1884,7 +1884,6 @@ static struct file *presto_filp_dopen(struct dentry *dentry, int flags)
 	f->f_dentry = dentry;
 	f->f_pos = 0;
-	f->f_reada = 0;
 	f->f_op = NULL;
 	if (inode->i_op)
 		/* XXX should we set to presto ops, or leave at cache ops? */
......
@@ -67,11 +67,7 @@ struct raparms {
 	unsigned int		p_count;
 	ino_t			p_ino;
 	kdev_t			p_dev;
-	unsigned long		p_reada,
-				p_ramax,
-				p_raend,
-				p_ralen,
-				p_rawin;
+	struct file_ra_state	p_ra;
 };
 
 static struct raparms *	raparml;
@@ -564,11 +560,7 @@ nfsd_get_raparms(kdev_t dev, ino_t ino)
 	ra = *frap;
 	ra->p_dev = dev;
 	ra->p_ino = ino;
-	ra->p_reada = 0;
-	ra->p_ramax = 0;
-	ra->p_raend = 0;
-	ra->p_ralen = 0;
-	ra->p_rawin = 0;
+	memset(&ra->p_ra, 0, sizeof(ra->p_ra));
 found:
 	if (rap != &raparm_cache) {
 		*rap = ra->p_next;
@@ -611,31 +603,18 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	/* Get readahead parameters */
 	ra = nfsd_get_raparms(inode->i_dev, inode->i_ino);
-	if (ra) {
-		file.f_reada = ra->p_reada;
-		file.f_ramax = ra->p_ramax;
-		file.f_raend = ra->p_raend;
-		file.f_ralen = ra->p_ralen;
-		file.f_rawin = ra->p_rawin;
-	}
+	if (ra)
+		file.f_ra = ra->p_ra;
 	file.f_pos = offset;
 
-	oldfs = get_fs(); set_fs(KERNEL_DS);
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
 	err = file.f_op->read(&file, buf, *count, &file.f_pos);
 	set_fs(oldfs);
 
 	/* Write back readahead params */
-	if (ra != NULL) {
-		dprintk("nfsd: raparms %ld %ld %ld %ld %ld\n",
-			file.f_reada, file.f_ramax, file.f_raend,
-			file.f_ralen, file.f_rawin);
-		ra->p_reada = file.f_reada;
-		ra->p_ramax = file.f_ramax;
-		ra->p_raend = file.f_raend;
-		ra->p_ralen = file.f_ralen;
-		ra->p_rawin = file.f_rawin;
-		ra->p_count -= 1;
-	}
+	if (ra)
+		ra->p_ra = file.f_ra;
 
 	if (err >= 0) {
 		nfsdstats.io_read += err;
......
@@ -635,7 +635,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 	f->f_dentry = dentry;
 	f->f_vfsmnt = mnt;
 	f->f_pos = 0;
-	f->f_reada = 0;
 	f->f_op = fops_get(inode->i_fop);
 	file_move(f, &inode->i_sb->s_files);
......
@@ -37,7 +37,6 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 	if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
-			file->f_reada = 0;
 			file->f_version = ++event;
 		}
 		retval = offset;
 
@@ -62,7 +61,6 @@ loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 	if (offset>=0 && offset<=file->f_dentry->d_inode->i_sb->s_maxbytes) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
-			file->f_reada = 0;
 			file->f_version = ++event;
 		}
 		retval = offset;
 
@@ -92,7 +90,6 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 	if (offset >= 0) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
-			file->f_reada = 0;
 			file->f_version = ++event;
 		}
 		retval = offset;
......
@@ -152,6 +152,12 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 
+	/*
+	 * The VM-level readahead tunable for this device.  In
+	 * units of 512-byte sectors.
+	 */
+	unsigned		ra_sectors;
+
 	/*
 	 * The queue owner gets to use this for whatever they like.
 	 * ll_rw_blk doesn't touch it.
@@ -308,6 +314,8 @@ extern void blk_queue_hardsect_size(request_queue_t *q, unsigned short);
 extern void blk_queue_segment_boundary(request_queue_t *q, unsigned long);
 extern void blk_queue_assign_lock(request_queue_t *q, spinlock_t *);
 extern void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn);
+extern int blk_set_readahead(kdev_t dev, unsigned sectors);
+extern unsigned blk_get_readahead(kdev_t dev);
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
@@ -322,10 +330,6 @@ extern int * blksize_size[MAX_BLKDEV];
 #define MAX_SEGMENT_SIZE	65536
 
-/* read-ahead in pages.. */
-#define MAX_READAHEAD	31
-#define MIN_READAHEAD	3
-
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
 extern void drive_stat_acct(struct request *, int, int);
......
@@ -173,12 +173,10 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
 #define BLKRRPART  _IO(0x12,95)	/* re-read partition table */
 #define BLKGETSIZE _IO(0x12,96)	/* return device size /512 (long *arg) */
 #define BLKFLSBUF  _IO(0x12,97)	/* flush buffer cache */
-#if 0 /* Obsolete, these don't do anything. */
 #define BLKRASET   _IO(0x12,98)	/* set read ahead for block device */
 #define BLKRAGET   _IO(0x12,99)	/* get current read ahead setting */
 #define BLKFRASET  _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
 #define BLKFRAGET  _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
-#endif
 #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
 #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
 #define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
@@ -487,6 +485,18 @@ struct fown_struct {
 	int signum;	/* posix.1b rt signal to be delivered on IO */
 };
 
+/*
+ * Track a single file's readahead state
+ */
+struct file_ra_state {
+	unsigned long start;		/* Current window */
+	unsigned long size;
+	unsigned long next_size;	/* Next window size */
+	unsigned long prev_page;	/* Cache last read() position */
+	unsigned long ahead_start;	/* Ahead window */
+	unsigned long ahead_size;
+};
+
 struct file {
 	struct list_head	f_list;
 	struct dentry		*f_dentry;
 
@@ -496,10 +506,10 @@ struct file {
 	unsigned int		f_flags;
 	mode_t			f_mode;
 	loff_t			f_pos;
-	unsigned long		f_reada, f_ramax, f_raend, f_ralen, f_rawin;
 	struct fown_struct	f_owner;
 	unsigned int		f_uid, f_gid;
 	int			f_error;
+	struct file_ra_state	f_ra;
 
 	unsigned long		f_version;
......
@@ -531,6 +531,13 @@ extern void truncate_inode_pages(struct address_space *, loff_t);
 extern int filemap_sync(struct vm_area_struct *, unsigned long, size_t, unsigned int);
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
 
+/* readahead.c */
+void do_page_cache_readahead(struct file *file,
+			unsigned long offset, unsigned long nr_to_read);
+void page_cache_readahead(struct file *file, unsigned long offset);
+void page_cache_readaround(struct file *file, unsigned long offset);
+void handle_ra_thrashing(struct file *file);
+
 /* vma is the first one with address < vma->vm_end,
  * and even address < vma->vm_start. Have to extend vma. */
 static inline int expand_stack(struct vm_area_struct * vma, unsigned long address)
......
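These four declarations are the whole interface the rest of the kernel
sees; the state lives in file->f_ra and, per the commit message,
callers supply no hints.  Below is a self-contained sketch of the
calling convention a read path might use.  The stub types and the
one-call-per-page convention are assumptions for illustration, not a
quote from mm/filemap.c.

#include <stdio.h>

/* Cut-down stand-ins for the kernel types; illustration only. */
struct file_ra_state {
	unsigned long start, size, next_size;
	unsigned long prev_page, ahead_start, ahead_size;
};
struct file {
	struct file_ra_state f_ra;
};

/* Stand-in for the real page_cache_readahead() declared above. */
static void page_cache_readahead(struct file *file, unsigned long offset)
{
	/* The real code updates file->f_ra and submits BIOs. */
	file->f_ra.prev_page = offset;
}

/* A read path consults readahead once per page, before the
 * page-cache lookup, and never touches f_ra directly. */
static void read_one_page(struct file *file, unsigned long index)
{
	page_cache_readahead(file, index);
	/* ... find_get_page(), copy to userspace, etc. ... */
}

int main(void)
{
	struct file f = { { 0, 0, 0, 0, 0, 0 } };
	unsigned long i;

	for (i = 0; i < 8; i++)
		read_one_page(&f, i);
	printf("last offset seen by readahead: %lu\n", f.f_ra.prev_page);
	return 0;
}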
@@ -91,7 +91,6 @@ static inline mddev_t * kdev_to_mddev (kdev_t dev)
 /*
  * default readahead
  */
-#define MD_READAHEAD	MAX_READAHEAD
 
 static inline int disk_faulty(mdp_disk_t * d)
 {
......
@@ -14,6 +14,6 @@ export-objs := shmem.o filemap.o mempool.o page_alloc.o
 obj-y	 := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
-	    shmem.o highmem.o mempool.o msync.o mincore.o
+	    shmem.o highmem.o mempool.o msync.o mincore.o readahead.o
 
 include $(TOPDIR)/Rules.make