Commit 12affe8f authored by Andrew Morton, committed by Linus Torvalds

[PATCH] rework readahead for congested queues

Since Jens changed the block layer to fail readahead if the queue has no
requests free, a few changes suggest themselves.

- It's a bit silly to go and allocate a bunch of pages, build BIOs for them,
  and submit the IO only to have it fail, forcing us to free the pages again.

  So the patch changes do_page_cache_readahead() to peek at the queue's
  read_congested state.  If the queue is read-congested we abandon the entire
  readahead up-front without doing all that work.

- If the queue is not read-congested, we go ahead and do the readahead,
  after having set PF_READAHEAD.

  The backing_dev_info's read-congested threshold cuts in when 7/8ths of
  the queue's requests are in flight, so it is probable that the readahead
  abandonment code in __make_request will now almost never trigger.
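
  As a rough illustration of that threshold arithmetic (a simplified sketch
  with hypothetical names, not the block layer's actual code):

	/* Illustrative only: read congestion cuts in once 7/8ths of the
	 * queue's requests are in flight.  Both parameters are made up
	 * for this sketch. */
	static int queue_is_read_congested(unsigned long nr_requests,
					   unsigned long in_flight)
	{
		return in_flight >= nr_requests - nr_requests / 8;
	}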

- The above changes make do_page_cache_readahead() "unreliable", in that it
  may do nothing at all.

  However, there are some system calls:

	- fadvise(POSIX_FADV_WILLNEED)
	- madvise(MADV_WILLNEED)
	- sys_readahead()

  for which the user expects the kernel to actually perform the IO.

  So the patch creates a new "force_page_cache_readahead()" which will
  perform the IO regardless of the queue's congestion state.
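
  In outline, the call sites split like this (a simplified sketch; the
  condition and the index/nr_pages variables are illustrative, and the real
  callers are in the diff below):

	if (user_requested)	/* fadvise/madvise/sys_readahead */
		err = force_page_cache_readahead(mapping, filp, index,
					max_sane_readahead(nr_pages));
	else			/* speculative readahead: may do nothing */
		err = do_page_cache_readahead(mapping, filp, index, nr_pages);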

  Arguably, this is the wrong thing to do: even though the application
  requested readahead, it could be that the kernel _should_ abandon the user's
  request because the disk is so busy.

  I don't know.  But for now, let's keep the above syscalls' behaviour
  unchanged.  It is trivial to switch back to do_page_cache_readahead()
  later.
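
  For reference, these paths can be exercised from userspace along these
  lines (a sketch; it assumes a libc that exposes posix_fadvise() and the
  Linux-specific readahead() wrapper):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int err, fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s file\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Both calls now reach force_page_cache_readahead(), so
		 * the kernel starts the IO even on a congested queue. */
		err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
		if (err)
			fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
		if (readahead(fd, 0, 1 << 20) < 0)
			perror("readahead");
		close(fd);
		return 0;
	}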
parent d49ceaba
@@ -570,6 +570,8 @@ int write_one_page(struct page *page, int wait);
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read);
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read);
 void page_cache_readahead(struct address_space *mapping,
 			  struct file_ra_state *ra,
 			  struct file *filp,
...
@@ -56,7 +56,7 @@ long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
 		ret = -EINVAL;
 		break;
 	}
-	ret = do_page_cache_readahead(mapping, file,
+	ret = force_page_cache_readahead(mapping, file,
 			offset >> PAGE_CACHE_SHIFT,
 			max_sane_readahead(len >> PAGE_CACHE_SHIFT));
 	if (ret > 0)
...
@@ -870,7 +870,8 @@ do_readahead(struct address_space *mapping, struct file *filp,
 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
 		return -EINVAL;
-	do_page_cache_readahead(mapping, filp, index, max_sane_readahead(nr));
+	force_page_cache_readahead(mapping, filp, index,
+			max_sane_readahead(nr));
 	return 0;
 }
@@ -996,7 +997,8 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
 		goto no_cached_page;
 	did_readaround = 1;
-	do_page_cache_readahead(mapping, file, pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND);
+	do_page_cache_readahead(mapping, file,
+			pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND);
 	goto retry_find;
 }
@@ -1230,7 +1232,7 @@ static int filemap_populate(struct vm_area_struct *vma,
 	int err;
 	if (!nonblock)
-		do_page_cache_readahead(mapping, vma->vm_file,
+		force_page_cache_readahead(mapping, vma->vm_file,
 				pgoff, len >> PAGE_CACHE_SHIFT);
 repeat:
...
@@ -65,7 +65,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
 		end = vma->vm_end;
 	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	do_page_cache_readahead(file->f_dentry->d_inode->i_mapping,
+	force_page_cache_readahead(file->f_dentry->d_inode->i_mapping,
 			file, start, max_sane_readahead(end - start));
 	return 0;
 }
...
@@ -96,8 +96,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	struct pagevec lru_pvec;
 	int ret = 0;
-	current->flags |= PF_READAHEAD;
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		goto out;
@@ -118,7 +116,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	}
 	pagevec_lru_add(&lru_pvec);
 out:
-	current->flags &= ~PF_READAHEAD;
 	return ret;
 }
@@ -263,8 +260,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
  * memory at once.
  */
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read)
 {
 	int ret = 0;
@@ -291,6 +288,27 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return ret;
 }
+
+/*
+ * This version skips the IO if the queue is read-congested, and will tell the
+ * block layer to abandon the readahead if request allocation would block.
+ *
+ * force_page_cache_readahead() will ignore queue congestion and will block on
+ * request queues.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	int ret = 0;
+
+	if (!bdi_read_congested(mapping->backing_dev_info)) {
+		current->flags |= PF_READAHEAD;
+		ret = __do_page_cache_readahead(mapping, filp,
+						offset, nr_to_read);
+		current->flags &= ~PF_READAHEAD;
+	}
+	return ret;
+}
 /*
  * Check how effective readahead is being. If the amount of started IO is
  * less than expected then the file is partly or fully in pagecache and
...