Commit c9b22619 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] use the congestion APIs in pdflush

The key concept here is that pdflush does not block on request queues
any more.  Instead, it circulates across the queues, keeping any
non-congested queues full of write data.  When all queues are full,
pdflush takes a nap, to be woken when *any* queue exits write
congestion.
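
In API terms (a rough sketch, not part of the patch itself; both calls appear
in the hunks below): bdi_write_congested() is the nonblocking test for a
backed-up queue, and blk_congestion_wait() is the nap, returning when some
queue exits write congestion or the timeout expires.

        /* illustrative fragment only */
        if (bdi_write_congested(mapping->backing_dev_info))
                wbc->encountered_congestion = 1;        /* note it, don't block */
        ...
        /* every queue was congested: nap until one of them clears */
        blk_congestion_wait(WRITE, HZ/10);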

This code can keep sixty spindles saturated - we've never been able to
do that before.

 - Add the `nonblocking' flag to struct writeback_control, and teach
   the writeback paths to honour it.

 - Add the `encountered_congestion' flag to struct writeback_control
   and teach the writeback paths to set it.

So as soon as a mapping's backing_dev_info indicates that it is getting
congested, bale out of writeback.  And don't even start writeback
against filesystems whose queues are congested.
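
In a writeback path the check looks roughly like the one the mpage_writepages()
hunk below adds (condensed sketch, not the literal hunk):

        if (wbc->nonblocking && bdi_write_congested(bdi)) {
                blk_run_queues();                       /* kick the device queues */
                wbc->encountered_congestion = 1;        /* report back to the caller */
                return 0;                               /* bale out instead of blocking */
        }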

 - Convert pdflush's background_writeback() function to use
   nonblocking writeback.

This way, a single pdflush thread will circulate around all the
dirty queues, keeping them filled.
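
The resulting loop shape (condensed from the background_writeout() hunk below):
write a batch against every non-congested queue, and if nothing at all could be
written because everything was congested, nap in blk_congestion_wait() and go
around again.

        for ( ; ; ) {
                wbc.encountered_congestion = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                writeback_inodes(&wbc);
                if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
                        /* wrote nothing */
                        if (wbc.encountered_congestion)
                                blk_congestion_wait(WRITE, HZ/10);
                        else
                                break;          /* nothing left to do */
                }
        }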

 - Convert the pdflush `kupdate' function to do the same thing.

This solves the problem of pdflush thread pool exhaustion.

It solves the problem of pdflush startup latency.

It solves the (minor) problem wherein `kupdate' writeback only writes
back a single disk at a time (it was getting blocked on each queue in
turn).

It probably means that we only ever need a single pdflush thread.
parent f3332384
@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync,
  *
  * FIXME: this linear search could get expensive with many fileystems. But
  * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space.
+ * a queue with that address_space. (Easy: have a global "dirty superblocks"
+ * list).
  *
  * The inodes to be written are parked on sb->s_io. They are moved back onto
  * sb->s_dirty as they are selected for writing. This way, none can be missed
  * on the writer throttling path, and we get decent balancing between many
- * thrlttled threads: we don't want them all piling up on __wait_on_inode.
+ * throlttled threads: we don't want them all piling up on __wait_on_inode.
  */
 static void
 sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
-        struct list_head *tmp;
-        struct list_head *head;
         const unsigned long start = jiffies;    /* livelock avoidance */

         list_splice_init(&sb->s_dirty, &sb->s_io);
-        head = &sb->s_io;
-        while ((tmp = head->prev) != head) {
-                struct inode *inode = list_entry(tmp, struct inode, i_list);
+        while (!list_empty(&sb->s_io)) {
+                struct inode *inode = list_entry(sb->s_io.prev,
+                                                struct inode, i_list);
                 struct address_space *mapping = inode->i_mapping;
-                struct backing_dev_info *bdi;
+                struct backing_dev_info *bdi = mapping->backing_dev_info;
                 int really_sync;

-                if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
+                if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                        wbc->encountered_congestion = 1;
                         if (sb != blockdev_superblock)
-                                break;          /* inappropriate superblock */
+                                break;          /* Skip a congested fs */
                         list_move(&inode->i_list, &sb->s_dirty);
-                        continue;       /* not this blockdev */
+                        continue;               /* Skip a congested blockdev */
                 }
+
+                if (wbc->bdi && bdi != wbc->bdi) {
+                        if (sb != blockdev_superblock)
+                                break;          /* fs has the wrong queue */
+                        list_move(&inode->i_list, &sb->s_dirty);
+                        continue;               /* blockdev has wrong queue */
+                }
+
                 /* Was this inode dirtied after sync_sb_inodes was called? */
                 if (time_after(mapping->dirtied_when, start))
                         break;

                 /* Was this inode dirtied too recently? */
                 if (wbc->older_than_this && time_after(mapping->dirtied_when,
                                                 *wbc->older_than_this))
-                        goto out;
+                        break;

-                bdi = mapping->backing_dev_info;
-
                 /* Is another pdflush already flushing this queue? */
                 if (current_is_pdflush() && !writeback_acquire(bdi))
                         break;
@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
                 if (wbc->nr_to_write <= 0)
                         break;
         }
-out:
-        /*
-         * Leave any unwritten inodes on s_io.
-         */
-        return;
+        return;         /* Leave any unwritten inodes on s_io */
 }

 /*
@@ -22,6 +22,7 @@
 #include <linux/prefetch.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
 #include <linux/pagevec.h>

 /*
@@ -522,6 +523,7 @@ int
 mpage_writepages(struct address_space *mapping,
                 struct writeback_control *wbc, get_block_t get_block)
 {
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
         struct bio *bio = NULL;
         sector_t last_block_in_bio = 0;
         int ret = 0;
@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping,
         struct pagevec pvec;
         int (*writepage)(struct page *);

+        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                blk_run_queues();
+                wbc->encountered_congestion = 1;
+                return 0;
+        }
+
         writepage = NULL;
         if (get_block == NULL)
                 writepage = mapping->a_ops->writepage;
@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping,
                         }
                         if (ret || (--(wbc->nr_to_write) <= 0))
                                 done = 1;
+                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                                blk_run_queues();
+                                wbc->encountered_congestion = 1;
+                                done = 1;
+                        }
                 } else {
                         unlock_page(page);
                 }
@@ -43,6 +43,8 @@ struct writeback_control {
                                         older than this */
         long nr_to_write;               /* Write this many pages, and decrement
                                            this for each page written */
+        int nonblocking;                /* Don't get stuck on request queues */
+        int encountered_congestion;     /* An output: a queue is full */
 };

 void writeback_inodes(struct writeback_control *wbc);
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/sysrq.h>
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -172,21 +173,30 @@ static void background_writeout(unsigned long _min_pages)
                 .sync_mode      = WB_SYNC_NONE,
                 .older_than_this = NULL,
                 .nr_to_write    = 0,
+                .nonblocking    = 1,
         };

         CHECK_EMERGENCY_SYNC

         background_thresh = (dirty_background_ratio * total_pages) / 100;
-        do {
+        for ( ; ; ) {
                 struct page_state ps;

                 get_page_state(&ps);
                 if (ps.nr_dirty < background_thresh && min_pages <= 0)
                         break;
+                wbc.encountered_congestion = 0;
                 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                 writeback_inodes(&wbc);
                 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-        } while (wbc.nr_to_write <= 0);
+                if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+                        /* Wrote nothing */
+                        if (wbc.encountered_congestion)
+                                blk_congestion_wait(WRITE, HZ/10);
+                        else
+                                break;
+                }
+        }
         blk_run_queues();
 }
@@ -223,25 +233,36 @@ static void wb_kupdate(unsigned long arg)
         unsigned long oldest_jif;
         unsigned long start_jif;
         unsigned long next_jif;
+        long nr_to_write;
         struct page_state ps;
         struct writeback_control wbc = {
                 .bdi            = NULL,
                 .sync_mode      = WB_SYNC_NONE,
                 .older_than_this = &oldest_jif,
                 .nr_to_write    = 0,
+                .nonblocking    = 1,
         };

         sync_supers();
-        get_page_state(&ps);
+        get_page_state(&ps);
         oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
         start_jif = jiffies;
         next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-        wbc.nr_to_write = ps.nr_dirty;
+        nr_to_write = ps.nr_dirty;
+        while (nr_to_write > 0) {
+                wbc.encountered_congestion = 0;
+                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                 writeback_inodes(&wbc);
+                if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+                        if (wbc.encountered_congestion)
+                                blk_congestion_wait(WRITE, HZ);
+                        else
+                                break;  /* All the old data is written */
+                }
+                nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+        }
         blk_run_queues();
         yield();

         if (time_before(next_jif, jiffies + HZ))
                 next_jif = jiffies + HZ;
         mod_timer(&wb_timer, next_jif);