Commit bc858911 authored by Andrew Morton, committed by James Bottomley

[PATCH] Allow VFS readahead to fall to zero

Some workloads really, really want to have no readahead: for example,
databases which are performing small synchronous I/Os against a file which
has extremely poor layout.  Any readahead at all is a loss here.

But the current readahead code refuses to adapt that low.

Fix it up so that we can indeed adaptively disable readahead altogether, and
do not start it again until we have seen max_readahead()'s worth of
consecutive reads.
parent 731cf67c
...@@ -558,6 +558,7 @@ int write_one_page(struct page *page, int wait); ...@@ -558,6 +558,7 @@ int write_one_page(struct page *page, int wait);
/* readahead.c */ /* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
int do_page_cache_readahead(struct address_space *mapping, struct file *filp, int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
unsigned long offset, unsigned long nr_to_read); unsigned long offset, unsigned long nr_to_read);
void page_cache_readahead(struct address_space *mapping, void page_cache_readahead(struct address_space *mapping,
...@@ -569,7 +570,7 @@ void page_cache_readaround(struct address_space *mapping, ...@@ -569,7 +570,7 @@ void page_cache_readaround(struct address_space *mapping,
struct file *filp, struct file *filp,
unsigned long offset); unsigned long offset);
void handle_ra_miss(struct address_space *mapping, void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra); struct file_ra_state *ra, pgoff_t offset);
unsigned long max_sane_readahead(unsigned long nr); unsigned long max_sane_readahead(unsigned long nr);
/* Do stack extension */ /* Do stack extension */
......
...@@ -562,7 +562,7 @@ void do_generic_mapping_read(struct address_space *mapping, ...@@ -562,7 +562,7 @@ void do_generic_mapping_read(struct address_space *mapping,
find_page: find_page:
page = find_get_page(mapping, index); page = find_get_page(mapping, index);
if (unlikely(page == NULL)) { if (unlikely(page == NULL)) {
handle_ra_miss(mapping, ra); handle_ra_miss(mapping, ra, index);
goto no_cached_page; goto no_cached_page;
} }
if (!PageUptodate(page)) if (!PageUptodate(page))
...@@ -978,7 +978,7 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address ...@@ -978,7 +978,7 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
page = find_get_page(mapping, pgoff); page = find_get_page(mapping, pgoff);
if (!page) { if (!page) {
if (did_readahead) { if (did_readahead) {
handle_ra_miss(mapping,ra); handle_ra_miss(mapping, ra, pgoff);
did_readahead = 0; did_readahead = 0;
} }
goto no_cached_page; goto no_cached_page;
......
...@@ -136,6 +136,12 @@ static int read_pages(struct address_space *mapping, struct file *filp, ...@@ -136,6 +136,12 @@ static int read_pages(struct address_space *mapping, struct file *filp,
* ahead_size: Together, these form the "ahead window". * ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd. * ra_pages: The externally controlled max readahead for this fd.
* *
* When readahead is in the "maximally shrunk" state (next_size == -1UL),
* readahead is disabled. In this state, prev_page and size are used, inside
* handle_ra_miss(), to detect the resumption of sequential I/O. Once there
* has been a decent run of sequential I/O (defined by get_max_readahead),
* readahead is reenabled.
*
* The readahead code manages two windows - the "current" and the "ahead" * The readahead code manages two windows - the "current" and the "ahead"
* windows. The intent is that while the application is walking the pages * windows. The intent is that while the application is walking the pages
* in the current window, I/O is underway on the ahead window. When the * in the current window, I/O is underway on the ahead window. When the
...@@ -168,6 +174,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, ...@@ -168,6 +174,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
* will continue to perform linear reads. Either at the new file position, or * will continue to perform linear reads. Either at the new file position, or
* at the old one after another seek. * at the old one after another seek.
* *
* After enough misses, readahead is fully disabled. (next_size = -1UL).
*
* There is a special-case: if the first page which the application tries to * There is a special-case: if the first page which the application tries to
* read happens to be the first page of the file, it is assumed that a linear * read happens to be the first page of the file, it is assumed that a linear
* read is about to happen and the window is immediately set to half of the * read is about to happen and the window is immediately set to half of the
...@@ -253,14 +261,19 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, ...@@ -253,14 +261,19 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
int ret = 0; int ret = 0;
while (nr_to_read) { while (nr_to_read) {
int err;
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
if (this_chunk > nr_to_read) if (this_chunk > nr_to_read)
this_chunk = nr_to_read; this_chunk = nr_to_read;
ret = __do_page_cache_readahead(mapping, filp, err = __do_page_cache_readahead(mapping, filp,
offset, this_chunk); offset, this_chunk);
if (ret < 0) if (err < 0) {
ret = err;
break; break;
}
ret += err;
offset += this_chunk; offset += this_chunk;
nr_to_read -= this_chunk; nr_to_read -= this_chunk;
} }
...@@ -286,6 +299,7 @@ check_ra_success(struct file_ra_state *ra, pgoff_t attempt, ...@@ -286,6 +299,7 @@ check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
ra->ahead_size = ra->next_size; ra->ahead_size = ra->next_size;
} else { } else {
ra->next_size = -1UL; ra->next_size = -1UL;
ra->size = 0;
} }
} }
} }
...@@ -345,16 +359,19 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ...@@ -345,16 +359,19 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
ra->next_size += 2; ra->next_size += 2;
} else { } else {
/* /*
* A miss - lseek, pread, etc. Shrink the readahead * A miss - lseek, pagefault, pread, etc. Shrink the readahead
* window by 25%. * window by 25%.
*/ */
ra->next_size -= ra->next_size / 4; ra->next_size -= ra->next_size / 4 + 2;
} }
if (ra->next_size > max) if ((long)ra->next_size > (long)max)
ra->next_size = max; ra->next_size = max;
if (ra->next_size < min) if ((long)ra->next_size <= 0L) {
ra->next_size = min; ra->next_size = -1UL;
ra->size = 0;
goto out; /* Readahead is off */
}
/* /*
* Is this request outside the current window? * Is this request outside the current window?
...@@ -374,6 +391,7 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ...@@ -374,6 +391,7 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
ra->prev_page = ra->start; ra->prev_page = ra->start;
ra->ahead_start = 0; ra->ahead_start = 0;
ra->ahead_size = 0; ra->ahead_size = 0;
/* /*
* Control now returns, probably to sleep until I/O * Control now returns, probably to sleep until I/O
* completes against the first ahead page. * completes against the first ahead page.
...@@ -394,7 +412,6 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ...@@ -394,7 +412,6 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
ra->size = ra->next_size; ra->size = ra->next_size;
ra->ahead_start = 0; /* Invalidate these */ ra->ahead_start = 0; /* Invalidate these */
ra->ahead_size = 0; ra->ahead_size = 0;
actual = do_page_cache_readahead(mapping, filp, offset, actual = do_page_cache_readahead(mapping, filp, offset,
ra->size); ra->size);
check_ra_success(ra, ra->size, actual, orig_next_size); check_ra_success(ra, ra->size, actual, orig_next_size);
...@@ -455,21 +472,37 @@ page_cache_readaround(struct address_space *mapping, struct file_ra_state *ra, ...@@ -455,21 +472,37 @@ page_cache_readaround(struct address_space *mapping, struct file_ra_state *ra,
* not found. This will happen if it was evicted by the VM (readahead * not found. This will happen if it was evicted by the VM (readahead
* thrashing) or if the readahead window is maximally shrunk. * thrashing) or if the readahead window is maximally shrunk.
* *
* If the window has been maximally shrunk (next_size == 0) then bump it up * If the window has been maximally shrunk (next_size == -1UL) then look to see
* again to resume readahead. * if we are getting misses against sequential file offsets. If so, and this
* persists then resume readahead.
* *
* Otherwise we're thrashing, so shrink the readahead window by three pages. * Otherwise we're thrashing, so shrink the readahead window by three pages.
* This is because it is grown by two pages on a readahead hit. Theory being * This is because it is grown by two pages on a readahead hit. Theory being
* that the readahead window size will stabilise around the maximum level at * that the readahead window size will stabilise around the maximum level at
* which there is no thrashing. * which there is no thrashing.
*/ */
void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra) void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra, pgoff_t offset)
{ {
const unsigned long min = get_min_readahead(ra);
if (ra->next_size == -1UL) { if (ra->next_size == -1UL) {
ra->next_size = min; const unsigned long max = get_max_readahead(ra);
if (offset != ra->prev_page + 1) {
ra->size = 0; /* Not sequential */
} else { } else {
ra->size++; /* A sequential read */
if (ra->size >= max) { /* Resume readahead */
ra->start = offset - max;
ra->next_size = max;
ra->size = max;
ra->ahead_start = 0;
ra->ahead_size = 0;
}
}
ra->prev_page = offset;
} else {
const unsigned long min = get_min_readahead(ra);
ra->next_size -= 3; ra->next_size -= 3;
if (ra->next_size < min) if (ra->next_size < min)
ra->next_size = min; ra->next_size = min;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment