Commit 22e2c507 authored by Jens Axboe, committed by Linus Torvalds

[PATCH] Update cfq io scheduler to time sliced design

This updates the CFQ io scheduler to the new time sliced design (cfq
v3).  It provides full process fairness, while giving excellent
aggregate system throughput even for many competing processes.  It
supports io priorities, either inherited from the cpu nice value or set
directly with the ioprio_get/set syscalls.  The latter closely mimic
set/getpriority.
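
The new syscalls take the same (which, who) selectors as getpriority/setpriority,
plus a priority argument that packs the scheduling class together with a level
inside that class (see fs/ioprio.c and include/linux/ioprio.h below). As a minimal
userspace sketch only (it assumes the i386 syscall numbers 289/290 added by this
patch and no libc wrapper; everything else is local to the sketch):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_ioprio_set
#define __NR_ioprio_set         289     /* i386 numbers from this patch */
#define __NR_ioprio_get         290
#endif

#define IOPRIO_CLASS_SHIFT      13      /* layout from include/linux/ioprio.h */
#define IOPRIO_CLASS_BE         2       /* best-effort class */
#define IOPRIO_WHO_PROCESS      1

int main(void)
{
        /* best-effort class, level 2 (0 is the highest level, 7 the lowest) */
        int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;

        /* who == 0 means the calling process, as with setpriority() */
        if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0)
                perror("ioprio_set");

        printf("ioprio is now 0x%lx\n",
               syscall(__NR_ioprio_get, IOPRIO_WHO_PROCESS, 0));
        return 0;
}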

This import is based on my latest from -mm.
Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 020f46a3
@@ -289,3 +289,5 @@ ENTRY(sys_call_table)
         .long sys_add_key
         .long sys_request_key
         .long sys_keyctl
+        .long sys_ioprio_set
+        .long sys_ioprio_get            /* 290 */
@@ -1577,8 +1577,8 @@ sys_call_table:
         data8 sys_add_key
         data8 sys_request_key
         data8 sys_keyctl
-        data8 sys_ni_syscall
-        data8 sys_ni_syscall                    // 1275
+        data8 sys_ioprio_set
+        data8 sys_ioprio_get                    // 1275
         data8 sys_set_zone_reclaim
         data8 sys_ni_syscall
         data8 sys_ni_syscall
...
@@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table)
         .long sys_request_key          /* 270 */
         .long sys_keyctl
         .long sys_waitid
+        .long sys_ioprio_set
+        .long sys_ioprio_get
@@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq)
         rq->elevator_private = NULL;
 }
 
-static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+static int as_set_request(request_queue_t *q, struct request *rq,
+                          struct bio *bio, int gfp_mask)
 {
         struct as_data *ad = q->elevator->elevator_data;
         struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
@@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
         return 1;
 }
 
-static int as_may_queue(request_queue_t *q, int rw)
+static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
 {
         int ret = ELV_MQUEUE_MAY;
         struct as_data *ad = q->elevator->elevator_data;
...
This diff is collapsed.
@@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq)
 }
 
 static int
-deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+                     int gfp_mask)
 {
         struct deadline_data *dd = q->elevator->elevator_data;
         struct deadline_rq *drq;
...
@@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq)
         return NULL;
 }
 
-int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+                    int gfp_mask)
 {
         elevator_t *e = q->elevator;
 
         if (e->ops->elevator_set_req_fn)
-                return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
+                return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
 
         rq->elevator_private = NULL;
         return 0;
@@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq)
                 e->ops->elevator_put_req_fn(q, rq);
 }
 
-int elv_may_queue(request_queue_t *q, int rw)
+int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
 {
         elevator_t *e = q->elevator;
 
         if (e->ops->elevator_may_queue_fn)
-                return e->ops->elevator_may_queue_fn(q, rw);
+                return e->ops->elevator_may_queue_fn(q, rw, bio);
 
         return ELV_MQUEUE_MAY;
 }
...
@@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
         rq->errors = 0;
         rq->rq_status = RQ_ACTIVE;
         rq->bio = rq->biotail = NULL;
+        rq->ioprio = 0;
         rq->buffer = NULL;
         rq->ref_count = 1;
         rq->q = q;
@@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q)
         if (!blk_remove_plug(q))
                 return;
 
-        /*
-         * was plugged, fire request_fn if queue has stuff to do
-         */
-        if (elv_next_request(q))
-                q->request_fn(q);
+        q->request_fn(q);
 }
 
 EXPORT_SYMBOL(__generic_unplug_device);
@@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq)
         mempool_free(rq, q->rq.rq_pool);
 }
 
-static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
-                                                int gfp_mask)
+static inline struct request *
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
 {
         struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
          */
         rq->flags = rw;
 
-        if (!elv_set_request(q, rq, gfp_mask))
+        if (!elv_set_request(q, rq, bio, gfp_mask))
                 return rq;
 
         mempool_free(rq, q->rq.rq_pool);
@@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw)
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
+                                   int gfp_mask)
 {
         struct request *rq = NULL;
         struct request_list *rl = &q->rq;
@@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
                 }
         }
 
-        switch (elv_may_queue(q, rw)) {
+        switch (elv_may_queue(q, rw, bio)) {
                 case ELV_MQUEUE_NO:
                         goto rq_starved;
                 case ELV_MQUEUE_MAY:
@@ -1920,7 +1918,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
                 set_queue_congested(q, rw);
         spin_unlock_irq(q->queue_lock);
 
-        rq = blk_alloc_request(q, rw, gfp_mask);
+        rq = blk_alloc_request(q, rw, bio, gfp_mask);
         if (!rq) {
                 /*
                  * Allocation failed presumably due to memory. Undo anything
@@ -1961,7 +1959,8 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
  * No available requests for this queue, unplug the device and wait for some
  * requests to become available.
  */
-static struct request *get_request_wait(request_queue_t *q, int rw)
+static struct request *get_request_wait(request_queue_t *q, int rw,
+                                        struct bio *bio)
 {
         DEFINE_WAIT(wait);
         struct request *rq;
@@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
                 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
                                 TASK_UNINTERRUPTIBLE);
 
-                rq = get_request(q, rw, GFP_NOIO);
+                rq = get_request(q, rw, bio, GFP_NOIO);
                 if (!rq) {
                         struct io_context *ioc;
@@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
         BUG_ON(rw != READ && rw != WRITE);
 
         if (gfp_mask & __GFP_WAIT)
-                rq = get_request_wait(q, rw);
+                rq = get_request_wait(q, rw, NULL);
         else
-                rq = get_request(q, rw, gfp_mask);
+                rq = get_request(q, rw, NULL, gfp_mask);
 
         return rq;
 }
@@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
                 return;
 
         req->rq_status = RQ_INACTIVE;
-        req->q = NULL;
         req->rl = NULL;
 
         /*
@@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req,
                 req->rq_disk->in_flight--;
         }
 
+        req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+
         __blk_put_request(q, next);
         return 1;
 }
@@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 {
         struct request *req, *freereq = NULL;
         int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
+        unsigned short prio;
         sector_t sector;
 
         sector = bio->bi_sector;
         nr_sectors = bio_sectors(bio);
         cur_nr_sectors = bio_cur_sectors(bio);
+        prio = bio_prio(bio);
         rw = bio_data_dir(bio);
         sync = bio_sync(bio);
@@ -2559,6 +2561,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
                         req->biotail->bi_next = bio;
                         req->biotail = bio;
                         req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+                        req->ioprio = ioprio_best(req->ioprio, prio);
                         drive_stat_acct(req, nr_sectors, 0);
                         if (!attempt_back_merge(q, req))
                                 elv_merged_request(q, req);
@@ -2583,6 +2586,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
                         req->hard_cur_sectors = cur_nr_sectors;
                         req->sector = req->hard_sector = sector;
                         req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+                        req->ioprio = ioprio_best(req->ioprio, prio);
                         drive_stat_acct(req, nr_sectors, 0);
                         if (!attempt_front_merge(q, req))
                                 elv_merged_request(q, req);
@@ -2610,7 +2614,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
                         freereq = NULL;
                 } else {
                         spin_unlock_irq(q->queue_lock);
-                        if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+                        if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) {
                                 /*
                                  * READA bit set
                                  */
@@ -2618,7 +2622,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
                                 if (bio_rw_ahead(bio))
                                         goto end_io;
 
-                                freereq = get_request_wait(q, rw);
+                                freereq = get_request_wait(q, rw, bio);
                         }
                         goto again;
                 }
@@ -2646,6 +2650,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
         req->buffer = bio_data(bio);    /* see ->buffer comment above */
         req->waiting = NULL;
         req->bio = req->biotail = bio;
+        req->ioprio = prio;
         req->rq_disk = bio->bi_bdev->bd_disk;
         req->start_time = jiffies;
@@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio)
         if (bdev != bdev->bd_contains) {
                 struct hd_struct *p = bdev->bd_part;
 
-                switch (bio->bi_rw) {
+                switch (bio_data_dir(bio)) {
                 case READ:
                         p->read_sectors += bio_sectors(bio);
                         p->reads++;
@@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q)
 {
         struct request_list *rl = &q->rq;
         struct request *rq;
+        int requeued = 0;
 
         spin_lock_irq(q->queue_lock);
         clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
@@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q)
                 rq = list_entry_rq(q->drain_list.next);
                 list_del_init(&rq->queuelist);
-                __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
+                elv_requeue_request(q, rq);
+                requeued++;
         }
 
+        if (requeued)
+                q->request_fn(q);
+
         spin_unlock_irq(q->queue_lock);
 
         wake_up(&rl->wait[0]);
@@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio)
         BIO_BUG_ON(!bio->bi_size);
         BIO_BUG_ON(!bio->bi_io_vec);
 
-        bio->bi_rw = rw;
+        bio->bi_rw |= rw;
         if (rw & WRITE)
                 mod_page_state(pgpgout, count);
         else
@@ -3257,8 +3267,11 @@ void exit_io_context(void)
         struct io_context *ioc;
 
         local_irq_save(flags);
+        task_lock(current);
         ioc = current->io_context;
         current->io_context = NULL;
+        ioc->task = NULL;
+        task_unlock(current);
         local_irq_restore(flags);
 
         if (ioc->aic && ioc->aic->exit)
@@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags)
         ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
         if (ret) {
                 atomic_set(&ret->refcount, 1);
-                ret->pid = tsk->pid;
+                ret->task = current;
+                ret->set_ioprio = NULL;
                 ret->last_waited = jiffies; /* doesn't matter... */
                 ret->nr_batch_requests = 0; /* because this is 0 */
                 ret->aic = NULL;
                 ret->cic = NULL;
-                spin_lock_init(&ret->lock);
 
                 local_irq_save(flags);
...
@@ -10,6 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
         ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
         attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
         seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
+        ioprio.o
 
 obj-$(CONFIG_EPOLL)    += eventpoll.o
 obj-$(CONFIG_COMPAT)   += compat.o
...
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@suse.de>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimic getpriority/setpriority, see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting BE scheduling class with prio 2 is done ala:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.txt
*
*/
#include <linux/kernel.h>
#include <linux/ioprio.h>
#include <linux/blkdev.h>
static int set_task_ioprio(struct task_struct *task, int ioprio)
{
struct io_context *ioc;
if (task->uid != current->euid &&
task->uid != current->uid && !capable(CAP_SYS_NICE))
return -EPERM;
task_lock(task);
task->ioprio = ioprio;
ioc = task->io_context;
if (ioc && ioc->set_ioprio)
ioc->set_ioprio(ioc, ioprio);
task_unlock(task);
return 0;
}
asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int data = IOPRIO_PRIO_DATA(ioprio);
struct task_struct *p, *g;
struct user_struct *user;
int ret;
switch (class) {
case IOPRIO_CLASS_RT:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* fall through, rt has prio field too */
case IOPRIO_CLASS_BE:
if (data >= IOPRIO_BE_NR || data < 0)
return -EINVAL;
break;
case IOPRIO_CLASS_IDLE:
break;
default:
return -EINVAL;
}
ret = -ESRCH;
read_lock_irq(&tasklist_lock);
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_pid(who);
if (p)
ret = set_task_ioprio(p, ioprio);
break;
case IOPRIO_WHO_PGRP:
if (!who)
who = process_group(current);
do_each_task_pid(who, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret)
break;
} while_each_task_pid(who, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
if (!who)
user = current->user;
else
user = find_user(who);
if (!user)
break;
do_each_thread(g, p) {
if (p->uid != who)
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
break;
} while_each_thread(g, p);
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
read_unlock_irq(&tasklist_lock);
return ret;
}
asmlinkage int sys_ioprio_get(int which, int who)
{
struct task_struct *g, *p;
struct user_struct *user;
int ret = -ESRCH;
read_lock_irq(&tasklist_lock);
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_pid(who);
if (p)
ret = p->ioprio;
break;
case IOPRIO_WHO_PGRP:
if (!who)
who = process_group(current);
do_each_task_pid(who, PIDTYPE_PGID, p) {
if (ret == -ESRCH)
ret = p->ioprio;
else
ret = ioprio_best(ret, p->ioprio);
} while_each_task_pid(who, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
if (!who)
user = current->user;
else
user = find_user(who);
if (!user)
break;
do_each_thread(g, p) {
if (p->uid != user->uid)
continue;
if (ret == -ESRCH)
ret = p->ioprio;
else
ret = ioprio_best(ret, p->ioprio);
} while_each_thread(g, p);
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
read_unlock_irq(&tasklist_lock);
return ret;
}
@@ -645,18 +645,22 @@ struct buffer_chunk {
 static void write_chunk(struct buffer_chunk *chunk) {
   int i;
+  get_fs_excl();
   for (i = 0; i < chunk->nr ; i++) {
     submit_logged_buffer(chunk->bh[i]) ;
   }
   chunk->nr = 0;
+  put_fs_excl();
 }
 
 static void write_ordered_chunk(struct buffer_chunk *chunk) {
   int i;
+  get_fs_excl();
   for (i = 0; i < chunk->nr ; i++) {
     submit_ordered_buffer(chunk->bh[i]) ;
   }
   chunk->nr = 0;
+  put_fs_excl();
 }
 
 static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
     return 0 ;
   }
 
+  get_fs_excl();
+
   /* before we can put our commit blocks on disk, we have to make sure everyone older than
   ** us is on disk too
   */
@@ -1055,6 +1061,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
   if (retval)
     reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
+  put_fs_excl();
   return retval;
 }
@@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s,
     return 0 ;
   }
 
+  get_fs_excl();
+
   /* if all the work is already done, get out of here */
   if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
       atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -1450,6 +1459,7 @@ static int flush_journal_list(struct super_block *s,
   put_journal_list(s, jl);
   if (flushall)
     up(&journal->j_flush_sem);
+  put_fs_excl();
   return err ;
 }
@@ -2719,6 +2729,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
   th->t_trans_id = journal->j_trans_id ;
   unlock_journal(p_s_sb) ;
   INIT_LIST_HEAD (&th->t_list);
+  get_fs_excl();
   return 0 ;
 
 out_fail:
@@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   BUG_ON (th->t_refcount > 1);
   BUG_ON (!th->t_trans_id);
 
+  put_fs_excl();
   current->journal_info = th->t_handle_save;
   reiserfs_check_lock_depth(p_s_sb, "journal end");
   if (journal->j_len == 0) {
...
@@ -294,8 +294,10 @@
 #define __NR_add_key 286
 #define __NR_request_key 287
 #define __NR_keyctl 288
+#define __NR_ioprio_set 289
+#define __NR_ioprio_get 290
 
-#define NR_syscalls 289
+#define NR_syscalls 291
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
...
@@ -263,6 +263,8 @@
 #define __NR_add_key 1271
 #define __NR_request_key 1272
 #define __NR_keyctl 1273
+#define __NR_ioprio_set 1274
+#define __NR_ioprio_get 1275
 #define __NR_set_zone_reclaim 1276
 
 #ifdef __KERNEL__
...
@@ -277,8 +277,10 @@
 #define __NR_request_key 270
 #define __NR_keyctl 271
 #define __NR_waitid 272
+#define __NR_ioprio_set 273
+#define __NR_ioprio_get 274
 
-#define __NR_syscalls 273
+#define __NR_syscalls 275
 
 #define __NR(n) #n
...
@@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key)
 __SYSCALL(__NR_request_key, sys_request_key)
 #define __NR_keyctl 250
 __SYSCALL(__NR_keyctl, sys_keyctl)
+#define __NR_ioprio_set 251
+__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
+#define __NR_ioprio_get 252
+__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
 
-#define __NR_syscall_max __NR_keyctl
+#define __NR_syscall_max __NR_ioprio_get
 
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */
...
@@ -22,6 +22,7 @@
 #include <linux/highmem.h>
 #include <linux/mempool.h>
+#include <linux/ioprio.h>
 
 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */
 #include <asm/io.h>
@@ -149,6 +150,19 @@ struct bio {
 #define BIO_RW_FAILFAST 3
 #define BIO_RW_SYNC     4
 
+/*
+ * upper 16 bits of bi_rw define the io priority of this bio
+ */
+#define BIO_PRIO_SHIFT  (8 * sizeof(unsigned long) - IOPRIO_BITS)
+
+#define bio_prio(bio)   ((bio)->bi_rw >> BIO_PRIO_SHIFT)
+#define bio_prio_valid(bio)     ioprio_valid(bio_prio(bio))
+
+#define bio_set_prio(bio, prio)         do {                            \
+        WARN_ON(prio >= (1 << IOPRIO_BITS));                            \
+        (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1);                  \
+        (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT);     \
+} while (0)
+
 /*
  * various member access, note that bio_data should of course not be used
  * on highmem page vectors
...
@@ -54,16 +54,23 @@ struct as_io_context {
 struct cfq_queue;
 struct cfq_io_context {
-        void (*dtor)(struct cfq_io_context *);
-        void (*exit)(struct cfq_io_context *);
-
-        struct io_context *ioc;
-
         /*
          * circular list of cfq_io_contexts belonging to a process io context
          */
         struct list_head list;
         struct cfq_queue *cfqq;
+        void *key;
+
+        struct io_context *ioc;
+
+        unsigned long last_end_request;
+        unsigned long last_queue;
+        unsigned long ttime_total;
+        unsigned long ttime_samples;
+        unsigned long ttime_mean;
+
+        void (*dtor)(struct cfq_io_context *);
+        void (*exit)(struct cfq_io_context *);
 };
 
 /*
@@ -73,7 +80,9 @@ struct cfq_io_context {
  */
 struct io_context {
         atomic_t refcount;
-        pid_t pid;
+        struct task_struct *task;
+
+        int (*set_ioprio)(struct io_context *, unsigned int);
 
         /*
          * For request batching
@@ -81,8 +90,6 @@ struct io_context {
         unsigned long last_waited; /* Time last woken after wait for request */
         int nr_batch_requests;     /* Number of requests left in the batch */
 
-        spinlock_t lock;
-
         struct as_io_context *aic;
         struct cfq_io_context *cic;
 };
@@ -134,6 +141,8 @@ struct request {
         void *elevator_private;
 
+        unsigned short ioprio;
+
         int rq_status;  /* should split this into a few status bits */
         struct gendisk *rq_disk;
         int errors;
...
@@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *);
 typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
 typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
-typedef int (elevator_may_queue_fn) (request_queue_t *, int);
-typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
+typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);
+typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
@@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *);
 extern struct request *elv_latter_request(request_queue_t *, struct request *);
 extern int elv_register_queue(request_queue_t *q);
 extern void elv_unregister_queue(request_queue_t *q);
-extern int elv_may_queue(request_queue_t *, int);
+extern int elv_may_queue(request_queue_t *, int, struct bio *);
 extern void elv_completed_request(request_queue_t *, struct request *);
-extern int elv_set_request(request_queue_t *, struct request *, int);
+extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int);
 extern void elv_put_request(request_queue_t *, struct request *);
 
 /*
...
@@ -213,6 +213,7 @@ extern int dir_notify_enable;
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -822,16 +823,34 @@ enum {
 #define vfs_check_frozen(sb, level) \
         wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
 
+static inline void get_fs_excl(void)
+{
+        atomic_inc(&current->fs_excl);
+}
+
+static inline void put_fs_excl(void)
+{
+        atomic_dec(&current->fs_excl);
+}
+
+static inline int has_fs_excl(void)
+{
+        return atomic_read(&current->fs_excl);
+}
+
 /*
  * Superblock locking.
  */
 static inline void lock_super(struct super_block * sb)
 {
+        get_fs_excl();
         down(&sb->s_lock);
 }
 
 static inline void unlock_super(struct super_block * sb)
 {
+        put_fs_excl();
         up(&sb->s_lock);
 }
...
@@ -81,6 +81,7 @@ extern struct group_info init_groups;
         .mm             = NULL,                                         \
         .active_mm      = &init_mm,                                     \
         .run_list       = LIST_HEAD_INIT(tsk.run_list),                 \
+        .ioprio         = 0,                                            \
         .time_slice     = HZ,                                           \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
         .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),          \
@@ -110,6 +111,7 @@ extern struct group_info init_groups;
         .proc_lock      = SPIN_LOCK_UNLOCKED,                           \
         .journal_info   = NULL,                                         \
         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
+        .fs_excl        = ATOMIC_INIT(0),                               \
 }
...
#ifndef IOPRIO_H
#define IOPRIO_H
#include <linux/sched.h>
/*
* Gives us 8 prio classes with 13-bits of data for each class
*/
#define IOPRIO_BITS (16)
#define IOPRIO_CLASS_SHIFT (13)
#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
/*
* These are the io priority groups as implemented by CFQ. RT is the realtime
* class, it always gets premium service. BE is the best-effort scheduling
* class, the default for any process. IDLE is the idle scheduling class, it
* is only served when no one else is using the disk.
*/
enum {
IOPRIO_CLASS_NONE,
IOPRIO_CLASS_RT,
IOPRIO_CLASS_BE,
IOPRIO_CLASS_IDLE,
};
/*
* 8 best effort priority levels are supported
*/
#define IOPRIO_BE_NR (8)
asmlinkage int sys_ioprio_set(int, int, int);
asmlinkage int sys_ioprio_get(int, int);
enum {
IOPRIO_WHO_PROCESS = 1,
IOPRIO_WHO_PGRP,
IOPRIO_WHO_USER,
};
/*
* if process has set io priority explicitly, use that. if not, convert
* the cpu scheduler nice value to an io priority
*/
#define IOPRIO_NORM (4)
static inline int task_ioprio(struct task_struct *task)
{
WARN_ON(!ioprio_valid(task->ioprio));
return IOPRIO_PRIO_DATA(task->ioprio);
}
static inline int task_nice_ioprio(struct task_struct *task)
{
return (task_nice(task) + 20) / 5;
}
/*
* For inheritance, return the highest of the two given priorities
*/
static inline int ioprio_best(unsigned short aprio, unsigned short bprio)
{
unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
if (!ioprio_valid(aprio))
return bprio;
if (!ioprio_valid(bprio))
return aprio;
if (aclass == IOPRIO_CLASS_NONE)
aclass = IOPRIO_CLASS_BE;
if (bclass == IOPRIO_CLASS_NONE)
bclass = IOPRIO_CLASS_BE;
if (aclass == bclass)
return min(aprio, bprio);
if (aclass > bclass)
return bprio;
else
return aprio;
}
#endif
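To make the encoding and inheritance rule above concrete, here is a small
standalone sketch (not part of the patch) that mirrors the macros and the
ioprio_best() decision: the class bits sit above a 13-bit data field, RT beats
BE beats IDLE, and within a class the numerically lower value wins.

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT      13
#define IOPRIO_PRIO_MASK        ((1UL << IOPRIO_CLASS_SHIFT) - 1)
#define IOPRIO_PRIO_CLASS(m)    ((m) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(m)     ((m) & IOPRIO_PRIO_MASK)

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

/* same decision as ioprio_best() above, minus the ioprio_valid() shortcut */
static unsigned short best(unsigned short a, unsigned short b)
{
        unsigned short ac = IOPRIO_PRIO_CLASS(a), bc = IOPRIO_PRIO_CLASS(b);

        if (ac == IOPRIO_CLASS_NONE)
                ac = IOPRIO_CLASS_BE;
        if (bc == IOPRIO_CLASS_NONE)
                bc = IOPRIO_CLASS_BE;
        if (ac == bc)
                return a < b ? a : b;   /* same class: lower data wins */
        return ac > bc ? b : a;         /* different class: lower class wins */
}

int main(void)
{
        unsigned short rt1 = (IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT) | 1;
        unsigned short be4 = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 4;

        printf("rt1: class %u, data %u\n",
               (unsigned)IOPRIO_PRIO_CLASS(rt1), (unsigned)IOPRIO_PRIO_DATA(rt1));
        printf("best(rt1, be4) is rt1: %d\n", best(rt1, be4) == rt1);
        return 0;
}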
@@ -608,6 +608,8 @@ struct task_struct {
         struct list_head run_list;
         prio_array_t *array;
 
+        unsigned short ioprio;
+
         unsigned long sleep_avg;
         unsigned long long timestamp, last_ran;
         unsigned long long sched_time; /* sched_clock time spent running */
@@ -763,6 +765,7 @@ struct task_struct {
         nodemask_t mems_allowed;
         int cpuset_mems_generation;
 #endif
+        atomic_t fs_excl;       /* holding fs exclusive resources */
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p);
 /*
  * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
- * subscriptions and synchronises with wait4().  Also used in procfs.
+ * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+ * pins the final release of task.io_context.
  *
  * Nests both inside and outside of read_lock(&tasklist_lock).
  * It must not be nested with write_lock_irq(&tasklist_lock),
...
@@ -14,11 +14,13 @@ extern struct list_head inode_unused;
  * Yes, writeback.h requires sched.h
  * No, sched.h is not included from here.
  */
-static inline int current_is_pdflush(void)
+static inline int task_is_pdflush(struct task_struct *task)
 {
-        return current->flags & PF_FLUSHER;
+        return task->flags & PF_FLUSHER;
 }
 
+#define current_is_pdflush()    task_is_pdflush(current)
+
 /*
  * fs/fs-writeback.c
  */
...
@@ -784,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)
         profile_task_exit(tsk);
 
+        WARN_ON(atomic_read(&tsk->fs_excl));
+
         if (unlikely(in_interrupt()))
                 panic("Aiee, killing interrupt handler!");
         if (unlikely(!tsk->pid))
...
@@ -1090,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags,
                 spin_unlock(&current->sighand->siglock);
         }
 
+        /*
+         * inherit ioprio
+         */
+        p->ioprio = current->ioprio;
+
         SET_LINKS(p);
         if (unlikely(p->ptrace & PT_PTRACED))
                 __ptrace_link(p, current->parent);
...
@@ -3448,15 +3448,7 @@ int task_nice(const task_t *p)
 {
         return TASK_NICE(p);
 }
 
-/*
- * The only users of task_nice are binfmt_elf and binfmt_elf32.
- * binfmt_elf is no longer modular, but binfmt_elf32 still is.
- * Therefore, task_nice is needed if there is a compat_mode.
- */
-#ifdef CONFIG_COMPAT
 EXPORT_SYMBOL_GPL(task_nice);
-#endif
 
 /**
  * idle_cpu - is a given cpu idle currently?
...