Commit 12affe8f authored by Andrew Morton, committed by Linus Torvalds

[PATCH] rework readahead for congested queues

Since Jens changed the block layer to fail readahead if the queue has no
requests free, a few changes suggest themselves.

- It's a bit silly to go and allocate a bunch of pages, build BIOs for them,
  and submit the IO only to have it fail, forcing us to free the pages again.

  So the patch changes do_page_cache_readahead() to peek at the queue's
  read_congested state.  If the queue is read-congested we abandon the entire
  readahead up-front without doing all that work.

- If the queue is not read-congested, we go ahead and do the readahead,
  after having set PF_READAHEAD.

  The backing_dev_info's read-congested threshold cuts in when 7/8ths of
  the queue's requests are in flight, so it is probable that the readahead
  abandonment code in __make_request will now almost never trigger.

- The above changes make do_page_cache_readahead() "unreliable", in that it
  may do nothing at all.

  However there are some system calls:

	- fadvise(POSIX_FADV_WILLNEED)
	- madvise(MADV_WILLNEED)
	- sys_readahead()

  for which the user expects the kernel to actually perform the IO.

  So the patch creates a new "force_page_cache_readahead()" which will
  perform the IO regardless of the queue's congestion state.

  Arguably, this is the wrong thing to do: even though the application
  requested readahead, it could be that the kernel _should_ abandon the user's
  request because the disk is so busy.

  I don't know.  But for now, let's keep the above syscalls' behaviour
  unchanged.  It is trivial to switch back to do_page_cache_readahead()
  later.  A condensed sketch of the two entry points follows.
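For reference, here is a condensed sketch of the two entry points as they
stand after this patch.  It is condensed from the diff below: the 2MB
chunking loop and error handling are omitted, and force_page_cache_readahead()
really keeps the old chunked body rather than the one-line call shown here.
(With the 7/8ths threshold, a queue with, say, 128 requests would be
considered read-congested once roughly 112 are in flight.)

	int force_page_cache_readahead(struct address_space *mapping,
				struct file *filp,
				unsigned long offset, unsigned long nr_to_read)
	{
		/*
		 * Always performs the IO: ignores queue congestion and will
		 * block on request allocation.  (The real patch keeps the old
		 * 2MB-chunked loop here; a single call is shown for brevity.)
		 */
		return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
	}

	int do_page_cache_readahead(struct address_space *mapping,
				struct file *filp,
				unsigned long offset, unsigned long nr_to_read)
	{
		int ret = 0;

		/*
		 * Best-effort: bail out up-front if the queue is
		 * read-congested, and set PF_READAHEAD so the block layer may
		 * drop the IO rather than block on request allocation.
		 */
		if (!bdi_read_congested(mapping->backing_dev_info)) {
			current->flags |= PF_READAHEAD;
			ret = __do_page_cache_readahead(mapping, filp,
							offset, nr_to_read);
			current->flags &= ~PF_READAHEAD;
		}
		return ret;
	}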
@@ -570,6 +570,8 @@ int write_one_page(struct page *page, int wait);
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read);
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read);
 void page_cache_readahead(struct address_space *mapping,
 			  struct file_ra_state *ra,
 			  struct file *filp,
...
@@ -56,7 +56,7 @@ long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
 			ret = -EINVAL;
 			break;
 		}
-		ret = do_page_cache_readahead(mapping, file,
+		ret = force_page_cache_readahead(mapping, file,
 				offset >> PAGE_CACHE_SHIFT,
 				max_sane_readahead(len >> PAGE_CACHE_SHIFT));
 		if (ret > 0)
...
@@ -870,7 +870,8 @@ do_readahead(struct address_space *mapping, struct file *filp,
 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
 		return -EINVAL;
 
-	do_page_cache_readahead(mapping, filp, index, max_sane_readahead(nr));
+	force_page_cache_readahead(mapping, filp, index,
+			max_sane_readahead(nr));
 	return 0;
 }
@@ -996,7 +997,8 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
 			goto no_cached_page;
 
 		did_readaround = 1;
-		do_page_cache_readahead(mapping, file, pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND);
+		do_page_cache_readahead(mapping, file,
+				pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND);
 		goto retry_find;
 	}
@@ -1230,7 +1232,7 @@ static int filemap_populate(struct vm_area_struct *vma,
 	int err;
 
 	if (!nonblock)
-		do_page_cache_readahead(mapping, vma->vm_file,
+		force_page_cache_readahead(mapping, vma->vm_file,
 				pgoff, len >> PAGE_CACHE_SHIFT);
 repeat:
...
@@ -65,7 +65,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
 		end = vma->vm_end;
 	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	do_page_cache_readahead(file->f_dentry->d_inode->i_mapping,
+	force_page_cache_readahead(file->f_dentry->d_inode->i_mapping,
 			file, start, max_sane_readahead(end - start));
 	return 0;
 }
...
@@ -96,8 +96,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	struct pagevec lru_pvec;
 	int ret = 0;
 
-	current->flags |= PF_READAHEAD;
-
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		goto out;
@@ -118,7 +116,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	}
 	pagevec_lru_add(&lru_pvec);
 out:
-	current->flags &= ~PF_READAHEAD;
 	return ret;
 }
@@ -263,7 +260,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
  * memory at once.
  */
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read)
 {
 	int ret = 0;
@@ -291,6 +288,27 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return ret;
 }
 
+/*
+ * This version skips the IO if the queue is read-congested, and will tell the
+ * block layer to abandon the readahead if request allocation would block.
+ *
+ * force_page_cache_readahead() will ignore queue congestion and will block on
+ * request queues.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	int ret = 0;
+
+	if (!bdi_read_congested(mapping->backing_dev_info)) {
+		current->flags |= PF_READAHEAD;
+		ret = __do_page_cache_readahead(mapping, filp,
+						offset, nr_to_read);
+		current->flags &= ~PF_READAHEAD;
+	}
+	return ret;
+}
+
 /*
  * Check how effective readahead is being.  If the amount of started IO is
  * less than expected then the file is partly or fully in pagecache and
...
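As a userspace illustration (not part of the patch): a hypothetical
prefetch() helper exercising two of the entry points that this patch routes
through force_page_cache_readahead(), so the kernel is expected to start the
IO even when the disk queue is congested.

	#define _GNU_SOURCE		/* for readahead(2) */
	#include <fcntl.h>

	/* Hypothetical helper, for illustration only: both calls below reach
	 * force_page_cache_readahead() after this patch. */
	int prefetch(int fd, off_t off, size_t len)
	{
		/* POSIX_FADV_WILLNEED takes the fadvise path changed above.
		 * posix_fadvise() returns an errno value, 0 on success. */
		if (posix_fadvise(fd, off, len, POSIX_FADV_WILLNEED) != 0)
			return -1;

		/* readahead(2) enters the kernel via sys_readahead(). */
		if (readahead(fd, off, len) < 0)
			return -1;

		return 0;
	}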