Commit b743892d authored by Jens Axboe, committed by Linus Torvalds

[PATCH] issues with online scheduler switching

There are two issues with online io scheduler switching that this patch
addresses.  The first is pretty simple - it concerns racing with scheduler
removal on switch.  elevator_find() does not grab a reference to the io
scheduler, so the scheduler module could be unloaded before elevator_attach()
is run.  Add elevator_get(), which takes a module reference, to solve that.
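
For illustration, a caller-side sketch of the new get/put pairing; the helper
attach_to_queue() is hypothetical and only stands in for elevator_attach(),
and error handling is trimmed:

	/*
	 * Hypothetical caller, for illustration only.  elevator_get() pins the
	 * module behind the io scheduler type so it cannot be unloaded while we
	 * attach it.  Every successful get must be balanced by a put, including
	 * on error paths.
	 */
	static int pick_scheduler(request_queue_t *q, const char *name)
	{
		struct elevator_type *e = elevator_get(name);

		if (!e)
			return -EINVAL;

		if (attach_to_queue(q, e)) {
			/* attach failed, drop the module reference again */
			elevator_put(e);
			return -ENOMEM;
		}

		/* on success the queue holds the reference until elevator_exit() */
		return 0;
	}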

The second issue is that the flushing out of requests needed before switching
can be problematic with requests that aren't allocated in the block layer
(such as requests on the stack of a process).  The problem is that we don't
know when they will have finished, and most io schedulers need to access
their elevator structures on io completion.  This can be fixed by adding an
intermediate step that switches to noop, since it doesn't need to touch
anything but the request_queue.  The queue drain can then safely be split
into two operations - one that waits for file system requests, and one that
waits for the queue to completely empty.  Requests arriving after the first
drain are parked on a separate queue list until the switch is complete.
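
In outline, the switch sequence then looks as condensed below from
elevator_switch() in the diff (locking and error handling omitted for
brevity):

	/* 1: drain file system requests allocated from the block freelist */
	blk_wait_queue_drained(q, 0);

	elv_unregister_queue(q);
	old_elevator = q->elevator;

	/* 2: attach noop, which keeps no per-request elevator state */
	elevator_attach(q, noop_elevator, e);

	/* 3: now also wait for the remaining non-fs requests to finish */
	blk_wait_queue_drained(q, 1);

	/* 4: attach and register the new scheduler, then drop the old one */

	/* 5: clear the drain flag and re-insert requests parked on drain_list */
	blk_finish_queue_drain(q);
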
Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent c8d60033
@@ -113,14 +113,28 @@ struct elevator_type *elevator_find(const char *name)
return e;
}
static void elevator_put(struct elevator_type *e)
{
module_put(e->elevator_owner);
}
static struct elevator_type *elevator_get(const char *name)
{
struct elevator_type *e = elevator_find(name);
if (!e)
return NULL;
if (!try_module_get(e->elevator_owner))
return NULL;
return e;
}
static int elevator_attach(request_queue_t *q, struct elevator_type *e,
struct elevator_queue *eq)
{
int ret = 0;
if (!try_module_get(e->elevator_owner))
return -EINVAL;
memset(eq, 0, sizeof(*eq));
eq->ops = &e->ops;
eq->elevator_type = e;
@@ -156,7 +170,8 @@ static void elevator_setup_default(void)
#else
#error "You must build at least 1 IO scheduler into the kernel"
#endif
printk("elevator: using %s as default io scheduler\n", chosen_elevator);
printk(KERN_INFO "elevator: using %s as default io scheduler\n",
chosen_elevator);
}
static int __init elevator_setup(char *str)
@@ -178,17 +193,21 @@ int elevator_init(request_queue_t *q, char *name)
if (!name)
name = chosen_elevator;
e = elevator_find(name);
e = elevator_get(name);
if (!e)
return -EINVAL;
eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
if (!eq)
if (!eq) {
elevator_put(e->elevator_type);
return -ENOMEM;
}
ret = elevator_attach(q, e, eq);
if (ret)
if (ret) {
kfree(eq);
elevator_put(e->elevator_type);
}
return ret;
}
@@ -198,7 +217,7 @@ void elevator_exit(elevator_t *e)
if (e->ops->elevator_exit_fn)
e->ops->elevator_exit_fn(e);
module_put(e->elevator_type->elevator_owner);
elevator_put(e->elevator_type);
e->elevator_type = NULL;
kfree(e);
}
@@ -271,15 +290,24 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
blk_plug_device(q);
rq->q = q;
if (!test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) {
q->elevator->ops->elevator_add_req_fn(q, rq, where);
if (blk_queue_plugged(q)) {
int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight;
int nrq = q->rq.count[READ] + q->rq.count[WRITE]
- q->in_flight;
if (nrq == q->unplug_thresh)
__generic_unplug_device(q);
}
} else
/*
* if drain is set, store the request "locally". when the drain
* is finished, the requests will be handed ordered to the io
* scheduler
*/
list_add_tail(&rq->queuelist, &q->drain_list);
}
void elv_add_request(request_queue_t *q, struct request *rq, int where,
@@ -333,7 +361,8 @@ struct request *elv_next_request(request_queue_t *q)
end_that_request_chunk(rq, 0, nr_bytes);
end_that_request_last(rq);
} else {
printk("%s: bad return=%d\n", __FUNCTION__, ret);
printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
ret);
break;
}
}
@@ -486,7 +515,7 @@ int elv_register(struct elevator_type *e)
list_add_tail(&e->list, &elv_list);
spin_unlock_irq(&elv_list_lock);
printk("io scheduler %s registered\n", e->elevator_name);
printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);
return 0;
}
EXPORT_SYMBOL_GPL(elv_register);
@@ -503,19 +532,25 @@ EXPORT_SYMBOL_GPL(elv_unregister);
* switch to new_e io scheduler. be careful not to introduce deadlocks -
* we don't free the old io scheduler, before we have allocated what we
* need for the new one. this way we have a chance of going back to the old
* one, if the new one fails init for some reason
* one, if the new one fails init for some reason. we also do an intermediate
* switch to noop to ensure safety with stack-allocated requests, since they
* don't originate from the block layer allocator. noop is safe here, because
* it never needs to touch the elevator itself for completion events. DRAIN
* flags will make sure we don't touch it for additions either.
*/
static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
{
elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
struct elevator_type *noop_elevator = NULL;
elevator_t *old_elevator;
if (!e) {
printk("elevator: out of memory\n");
return;
}
if (!e)
goto error;
blk_wait_queue_drained(q);
/*
* first step, drain requests from the block freelist
*/
blk_wait_queue_drained(q, 0);
/*
* unregister old elevator data
@@ -523,6 +558,18 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
elv_unregister_queue(q);
old_elevator = q->elevator;
/*
* next step, switch to noop since it uses no private rq structures
* and doesn't allocate any memory for anything. then wait for any
* non-fs requests in-flight
*/
noop_elevator = elevator_get("noop");
spin_lock_irq(q->queue_lock);
elevator_attach(q, noop_elevator, e);
spin_unlock_irq(q->queue_lock);
blk_wait_queue_drained(q, 1);
/*
* attach and start new elevator
*/
@@ -537,6 +584,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
*/
elevator_exit(old_elevator);
blk_finish_queue_drain(q);
elevator_put(noop_elevator);
return;
fail_register:
@@ -549,7 +597,11 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
q->elevator = old_elevator;
elv_register_queue(q);
blk_finish_queue_drain(q);
printk("elevator: switch to %s failed\n", new_e->elevator_name);
error:
if (noop_elevator)
elevator_put(noop_elevator);
elevator_put(new_e);
printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
}
ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
@@ -563,12 +615,15 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
if (elevator_name[strlen(elevator_name) - 1] == '\n')
elevator_name[strlen(elevator_name) - 1] = '\0';
e = elevator_find(elevator_name);
e = elevator_get(elevator_name);
if (!e) {
printk("elevator: type %s not found\n", elevator_name);
printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
return -EINVAL;
}
if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name))
return count;
elevator_switch(q, e);
return count;
}
@@ -261,6 +261,8 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
blk_queue_activity_fn(q, NULL, NULL);
INIT_LIST_HEAD(&q->drain_list);
}
EXPORT_SYMBOL(blk_queue_make_request);
@@ -2481,20 +2483,42 @@ static inline void blk_partition_remap(struct bio *bio)
void blk_finish_queue_drain(request_queue_t *q)
{
struct request_list *rl = &q->rq;
struct request *rq;
spin_lock_irq(q->queue_lock);
clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
while (!list_empty(&q->drain_list)) {
rq = list_entry_rq(q->drain_list.next);
list_del_init(&rq->queuelist);
__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
}
spin_unlock_irq(q->queue_lock);
wake_up(&rl->wait[0]);
wake_up(&rl->wait[1]);
wake_up(&rl->drain);
}
static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
{
int wait = rl->count[READ] + rl->count[WRITE];
if (dispatch)
wait += !list_empty(&q->queue_head);
return wait;
}
/*
* We rely on the fact that only requests allocated through blk_alloc_request()
* have io scheduler private data structures associated with them. Any other
* type of request (allocated on stack or through kmalloc()) should not go
* to the io scheduler core, but be attached to the queue head instead.
*/
void blk_wait_queue_drained(request_queue_t *q)
void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
{
struct request_list *rl = &q->rq;
DEFINE_WAIT(wait);
@@ -2502,10 +2526,10 @@ void blk_wait_queue_drained(request_queue_t *q)
spin_lock_irq(q->queue_lock);
set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
while (rl->count[READ] || rl->count[WRITE]) {
while (wait_drain(q, rl, wait_dispatch)) {
prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
if (rl->count[READ] || rl->count[WRITE]) {
if (wait_drain(q, rl, wait_dispatch)) {
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
io_schedule();
@@ -377,6 +377,8 @@ struct request_queue
*/
unsigned int sg_timeout;
unsigned int sg_reserved_size;
struct list_head drain_list;
};
#define RQ_INACTIVE (-1)
@@ -604,7 +606,7 @@ extern void blk_dump_rq_flags(struct request *, char *);
extern void generic_unplug_device(request_queue_t *);
extern void __generic_unplug_device(request_queue_t *);
extern long nr_blockdev_pages(void);
extern void blk_wait_queue_drained(request_queue_t *);
extern void blk_wait_queue_drained(request_queue_t *, int);
extern void blk_finish_queue_drain(request_queue_t *);
int blk_get_queue(request_queue_t *);