Commit 00c8e791 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] self-unplugging request queues

The patch teaches a queue to unplug itself:

a) if it has four requests OR
b) if it has had plugged requests for 3 milliseconds.

These numbers may need to be tuned, although doing so doesn't seem to
make much difference.  10 msecs works OK, so HZ=100 machines will be
fine.

Instrumentation shows that about 5-10% of requests were started due to
the three millisecond timeout (during a kernel compile).  That's
somewhat significant.  It means that the kernel is leaving stuff in the
queue, plugged, for too long.  This testing was with a uniprocessor
preemptible kernel, which is particularly vulnerable to unplug latency
(submit some IO, get preempted before the unplug).

This patch permits the removal of a lot of rather lame unplugging in
page reclaim and in the writeback code, which kicks the queues
(globally!) every four megabytes to get writeback underway.

This patch doesn't use blk_run_queues().  It is able to kick just the
particular queue.

The patch is not expected to make much difference really, except for
AIO.  AIO needs a blk_run_queues() in its io_submit() call.  For each
request.  This means that AIO has to disable plugging altogether,
unless something like this patch does it for it.  It means that AIO
will unplug *all* queues in the machine for every io_submit().  Even
against a socket!

This patch was tested by disabling blk_run_queues() completely.  The
system ran OK.

The 3 milliseconds may be too long.  It's OK for the heavy writeback
code, but AIO may want less.  Or maybe AIO really wants zero (ie:
disable plugging).  If that is so, we need new code paths by which AIO
can communicate the "immediate unplug" information - a global unplug is
not good.


To minimise unplug latency due to user CPU load, this patch gives keventd
`nice -10'.  This is of course completely arbitrary.  Really, I think keventd
should be SCHED_RR/MAX_RT_PRIO-1, as it has been in -aa kernels for ages.
parent c5070032
......@@ -27,6 +27,8 @@
#include <linux/completion.h>
#include <linux/slab.h>
static void blk_unplug_work(void *data);
/*
* For the allocated request tables
*/
......@@ -237,6 +239,14 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
blk_queue_hardsect_size(q, 512);
blk_queue_dma_alignment(q, 511);
q->unplug_thresh = 4; /* hmm */
q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
if (q->unplug_delay == 0)
q->unplug_delay = 1;
init_timer(&q->unplug_timer);
INIT_WORK(&q->unplug_work, blk_unplug_work, q);
/*
* by default assume old behaviour and bounce for any highmem page
*/
......@@ -960,6 +970,7 @@ void blk_plug_device(request_queue_t *q)
if (!blk_queue_plugged(q)) {
spin_lock(&blk_plug_lock);
list_add_tail(&q->plug_list, &blk_plug_list);
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
spin_unlock(&blk_plug_lock);
}
}
......@@ -974,6 +985,7 @@ int blk_remove_plug(request_queue_t *q)
if (blk_queue_plugged(q)) {
spin_lock(&blk_plug_lock);
list_del_init(&q->plug_list);
del_timer(&q->unplug_timer);
spin_unlock(&blk_plug_lock);
return 1;
}
......@@ -992,6 +1004,8 @@ static inline void __generic_unplug_device(request_queue_t *q)
if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
return;
del_timer(&q->unplug_timer);
/*
* was plugged, fire request_fn if queue has stuff to do
*/
......@@ -1020,6 +1034,18 @@ void generic_unplug_device(void *data)
spin_unlock_irq(q->queue_lock);
}
/*
 * Work-queue handler for the deferred unplug.  Runs in keventd
 * (process) context; 'data' is the request_queue_t pointer supplied
 * at INIT_WORK() time, so this simply unplugs that one queue.
 */
static void blk_unplug_work(void *data)
{
generic_unplug_device(data);
}
/*
 * Unplug timer expiry: the queue has stayed plugged for unplug_delay
 * jiffies.  Hand the actual unplug off to process context via the
 * queue's work item rather than doing it here, since the unplug path
 * takes the queue lock with spin_lock_irq(), which is not suitable
 * for timer context.
 */
static void blk_unplug_timeout(unsigned long data)
{
	schedule_work(&((request_queue_t *)data)->unplug_work);
}
/**
* blk_start_queue - restart a previously stopped queue
* @q: The &request_queue_t in question
......@@ -1164,6 +1190,9 @@ void blk_cleanup_queue(request_queue_t * q)
count -= __blk_cleanup_queue(&q->rq[READ]);
count -= __blk_cleanup_queue(&q->rq[WRITE]);
del_timer_sync(&q->unplug_timer);
flush_scheduled_work();
if (count)
printk("blk_cleanup_queue: leaked requests (%d)\n", count);
......@@ -1269,6 +1298,9 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
blk_queue_make_request(q, __make_request);
blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
q->unplug_timer.function = blk_unplug_timeout;
q->unplug_timer.data = (unsigned long)q;
blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
......@@ -1811,7 +1843,15 @@ static int __make_request(request_queue_t *q, struct bio *bio)
out:
if (freereq)
__blk_put_request(q, freereq);
if (blk_queue_plugged(q)) {
int nr_queued = (queue_nr_requests - q->rq[0].count) +
(queue_nr_requests - q->rq[1].count);
if (nr_queued == q->unplug_thresh)
__generic_unplug_device(q);
}
spin_unlock_irq(q->queue_lock);
return 0;
end_io:
......
......@@ -4,6 +4,8 @@
#include <linux/major.h>
#include <linux/genhd.h>
#include <linux/list.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
......@@ -188,6 +190,14 @@ struct request_queue
unplug_fn *unplug_fn;
merge_bvec_fn *merge_bvec_fn;
/*
* Auto-unplugging state
*/
struct timer_list unplug_timer;
int unplug_thresh; /* After this many requests */
unsigned long unplug_delay; /* After this many jiffies */
struct work_struct unplug_work;
struct backing_dev_info backing_dev_info;
/*
......
......@@ -177,6 +177,7 @@ static int worker_thread(void *__startup)
current->flags |= PF_IOTHREAD;
cwq->thread = current;
set_user_nice(current, -10);
set_cpus_allowed(current, 1UL << cpu);
spin_lock_irq(&current->sig->siglock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment