Commit b743892d authored by Jens Axboe, committed by Linus Torvalds

[PATCH] issues with online scheduler switching

There are two issues with online io scheduler switching that this patch
addresses.  The first is pretty simple - it concerns racing with scheduler
removal on switch.  elevator_find() does not grab a reference to the io
scheduler, so the scheduler module could be unloaded before elevator_attach()
is run.  Add elevator_get(), which takes a module reference, to solve that.
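
For illustration, a caller-side sketch of the new get/put pairing; the helper
attach_to_queue() is hypothetical and only stands in for elevator_attach(),
and error handling is trimmed:

	/*
	 * Hypothetical caller, for illustration only.  elevator_get() pins the
	 * module behind the io scheduler type so it cannot be unloaded while we
	 * attach it.  Every successful get must be balanced by a put, including
	 * on error paths.
	 */
	static int pick_scheduler(request_queue_t *q, const char *name)
	{
		struct elevator_type *e = elevator_get(name);

		if (!e)
			return -EINVAL;

		if (attach_to_queue(q, e)) {
			/* attach failed, drop the module reference again */
			elevator_put(e);
			return -ENOMEM;
		}

		/* on success the queue holds the reference until elevator_exit() */
		return 0;
	}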

The second issue is that the flushing out of requests needed before switching
can be problematic with requests that aren't allocated in the block layer
(such as requests on the stack of a process).  The problem is that we don't
know when they will have finished, and most io schedulers need to access
their elevator structures on io completion.  This can be fixed by adding an
intermediate step that switches to noop, since it doesn't need to touch
anything but the request_queue.  The queue drain can then safely be split
into two operations - one that waits for file system requests, and one that
waits for the queue to completely empty.  Requests arriving after the first
drain are parked on a separate queue list until the switch is complete.
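
In outline, the switch sequence then looks as condensed below from
elevator_switch() in the diff (locking and error handling omitted for
brevity):

	/* 1: drain file system requests allocated from the block freelist */
	blk_wait_queue_drained(q, 0);

	elv_unregister_queue(q);
	old_elevator = q->elevator;

	/* 2: attach noop, which keeps no per-request elevator state */
	elevator_attach(q, noop_elevator, e);

	/* 3: now also wait for the remaining non-fs requests to finish */
	blk_wait_queue_drained(q, 1);

	/* 4: attach and register the new scheduler, then drop the old one */

	/* 5: clear the drain flag and re-insert requests parked on drain_list */
	blk_finish_queue_drain(q);
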
Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent c8d60033
@@ -113,14 +113,28 @@ struct elevator_type *elevator_find(const char *name)
return e;
}
static void elevator_put(struct elevator_type *e)
{
module_put(e->elevator_owner);
}
static struct elevator_type *elevator_get(const char *name)
{
struct elevator_type *e = elevator_find(name);
if (!e)
return NULL;
if (!try_module_get(e->elevator_owner))
return NULL;
return e;
}
static int elevator_attach(request_queue_t *q, struct elevator_type *e,
struct elevator_queue *eq)
{
int ret = 0;
if (!try_module_get(e->elevator_owner))
return -EINVAL;
memset(eq, 0, sizeof(*eq));
eq->ops = &e->ops;
eq->elevator_type = e;
@@ -156,7 +170,8 @@ static void elevator_setup_default(void)
#else
#error "You must build at least 1 IO scheduler into the kernel"
#endif
printk("elevator: using %s as default io scheduler\n", chosen_elevator);
printk(KERN_INFO "elevator: using %s as default io scheduler\n",
chosen_elevator);
}
static int __init elevator_setup(char *str)
@@ -178,17 +193,21 @@ int elevator_init(request_queue_t *q, char *name)
if (!name)
name = chosen_elevator;
e = elevator_find(name);
e = elevator_get(name);
if (!e)
return -EINVAL;
eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
if (!eq)
if (!eq) {
elevator_put(e->elevator_type);
return -ENOMEM;
}
ret = elevator_attach(q, e, eq);
if (ret)
if (ret) {
kfree(eq);
elevator_put(e->elevator_type);
}
return ret;
}
@@ -198,7 +217,7 @@ void elevator_exit(elevator_t *e)
if (e->ops->elevator_exit_fn)
e->ops->elevator_exit_fn(e);
module_put(e->elevator_type->elevator_owner);
elevator_put(e->elevator_type);
e->elevator_type = NULL;
kfree(e);
}
@@ -271,15 +290,24 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
blk_plug_device(q);
rq->q = q;
if (!test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) {
q->elevator->ops->elevator_add_req_fn(q, rq, where);
if (blk_queue_plugged(q)) {
int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight;
int nrq = q->rq.count[READ] + q->rq.count[WRITE]
- q->in_flight;
if (nrq == q->unplug_thresh)
__generic_unplug_device(q);
}
} else
/*
* if drain is set, store the request "locally". when the drain
* is finished, the requests will be handed ordered to the io
* scheduler
*/
list_add_tail(&rq->queuelist, &q->drain_list);
}
void elv_add_request(request_queue_t *q, struct request *rq, int where,
@@ -333,7 +361,8 @@ struct request *elv_next_request(request_queue_t *q)
end_that_request_chunk(rq, 0, nr_bytes);
end_that_request_last(rq);
} else {
printk("%s: bad return=%d\n", __FUNCTION__, ret);
printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
ret);
break;
}
}
@@ -486,7 +515,7 @@ int elv_register(struct elevator_type *e)
list_add_tail(&e->list, &elv_list);
spin_unlock_irq(&elv_list_lock);
printk("io scheduler %s registered\n", e->elevator_name);
printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);
return 0;
}
EXPORT_SYMBOL_GPL(elv_register);
@@ -503,19 +532,25 @@ EXPORT_SYMBOL_GPL(elv_unregister);
* switch to new_e io scheduler. be careful not to introduce deadlocks -
* we don't free the old io scheduler, before we have allocated what we
* need for the new one. this way we have a chance of going back to the old
* one, if the new one fails init for some reason
* one, if the new one fails init for some reason. we also do an intermediate
* switch to noop to ensure safety with stack-allocated requests, since they
* don't originate from the block layer allocator. noop is safe here, because
* it never needs to touch the elevator itself for completion events. DRAIN
* flags will make sure we don't touch it for additions either.
*/
static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
{
elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
struct elevator_type *noop_elevator = NULL;
elevator_t *old_elevator;
if (!e) {
printk("elevator: out of memory\n");
return;
}
if (!e)
goto error;
blk_wait_queue_drained(q);
/*
* first step, drain requests from the block freelist
*/
blk_wait_queue_drained(q, 0);
/*
* unregister old elevator data
@@ -523,6 +558,18 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
elv_unregister_queue(q);
old_elevator = q->elevator;
/*
* next step, switch to noop since it uses no private rq structures
* and doesn't allocate any memory for anything. then wait for any
* non-fs requests in-flight
*/
noop_elevator = elevator_get("noop");
spin_lock_irq(q->queue_lock);
elevator_attach(q, noop_elevator, e);
spin_unlock_irq(q->queue_lock);
blk_wait_queue_drained(q, 1);
/*
* attach and start new elevator
*/
@@ -537,6 +584,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
*/
elevator_exit(old_elevator);
blk_finish_queue_drain(q);
elevator_put(noop_elevator);
return;
fail_register:
@@ -549,7 +597,11 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
q->elevator = old_elevator;
elv_register_queue(q);
blk_finish_queue_drain(q);
printk("elevator: switch to %s failed\n", new_e->elevator_name);
error:
if (noop_elevator)
elevator_put(noop_elevator);
elevator_put(new_e);
printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
}
ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
@@ -563,12 +615,15 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
if (elevator_name[strlen(elevator_name) - 1] == '\n')
elevator_name[strlen(elevator_name) - 1] = '\0';
e = elevator_find(elevator_name);
e = elevator_get(elevator_name);
if (!e) {
printk("elevator: type %s not found\n", elevator_name);
printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
return -EINVAL;
}
if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name))
return count;
elevator_switch(q, e);
return count;
}
@@ -261,6 +261,8 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
blk_queue_activity_fn(q, NULL, NULL);
INIT_LIST_HEAD(&q->drain_list);
}
EXPORT_SYMBOL(blk_queue_make_request);
@@ -2481,20 +2483,42 @@ static inline void blk_partition_remap(struct bio *bio)
void blk_finish_queue_drain(request_queue_t *q)
{
struct request_list *rl = &q->rq;
struct request *rq;
spin_lock_irq(q->queue_lock);
clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
while (!list_empty(&q->drain_list)) {
rq = list_entry_rq(q->drain_list.next);
list_del_init(&rq->queuelist);
__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
}
spin_unlock_irq(q->queue_lock);
wake_up(&rl->wait[0]);
wake_up(&rl->wait[1]);
wake_up(&rl->drain);
}
static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
{
int wait = rl->count[READ] + rl->count[WRITE];
if (dispatch)
wait += !list_empty(&q->queue_head);
return wait;
}
/*
* We rely on the fact that only requests allocated through blk_alloc_request()
* have io scheduler private data structures associated with them. Any other
* type of request (allocated on stack or through kmalloc()) should not go
* to the io scheduler core, but be attached to the queue head instead.
*/
void blk_wait_queue_drained(request_queue_t *q)
void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
{
struct request_list *rl = &q->rq;
DEFINE_WAIT(wait);
@@ -2502,10 +2526,10 @@ void blk_wait_queue_drained(request_queue_t *q)
spin_lock_irq(q->queue_lock);
set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
while (rl->count[READ] || rl->count[WRITE]) {
while (wait_drain(q, rl, wait_dispatch)) {
prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
if (rl->count[READ] || rl->count[WRITE]) {
if (wait_drain(q, rl, wait_dispatch)) {
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
io_schedule();
@@ -377,6 +377,8 @@ struct request_queue
*/
unsigned int sg_timeout;
unsigned int sg_reserved_size;
struct list_head drain_list;
};
#define RQ_INACTIVE (-1)
@@ -604,7 +606,7 @@ extern void blk_dump_rq_flags(struct request *, char *);
extern void generic_unplug_device(request_queue_t *);
extern void __generic_unplug_device(request_queue_t *);
extern long nr_blockdev_pages(void);
extern void blk_wait_queue_drained(request_queue_t *);
extern void blk_wait_queue_drained(request_queue_t *, int);
extern void blk_finish_queue_drain(request_queue_t *);
int blk_get_queue(request_queue_t *);