Commit c7c22e4d authored by Jens Axboe

block: add support for IO CPU affinity

This patch adds support for controlling the IO completion CPU of
either all requests on a queue, or on a per-request basis. We export
a sysfs variable (rq_affinity) which, if set, migrates completions
of requests to the CPU that originally submitted them. A bio helper
(bio_set_completion_cpu()) is also added, so that queuers can ask
for completion on that specific CPU.

In testing, this has been shown to cut the system time by as much
as 20-40% on synthetic workloads where CPU affinity is desired.

This requires a little help from the architecture, so it'll only
work as designed for archs that are using the new generic smp
helper infrastructure.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
parent 18887ad9
...@@ -110,7 +110,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) ...@@ -110,7 +110,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
memset(rq, 0, sizeof(*rq)); memset(rq, 0, sizeof(*rq));
INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->queuelist);
INIT_LIST_HEAD(&rq->donelist); rq->cpu = -1;
rq->q = q; rq->q = q;
rq->sector = rq->hard_sector = (sector_t) -1; rq->sector = rq->hard_sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash); INIT_HLIST_NODE(&rq->hash);
...@@ -322,6 +322,21 @@ void blk_unplug(struct request_queue *q) ...@@ -322,6 +322,21 @@ void blk_unplug(struct request_queue *q)
} }
EXPORT_SYMBOL(blk_unplug); EXPORT_SYMBOL(blk_unplug);
/*
 * Call the queue's request_fn, allowing at most one level of re-entry.
 * QUEUE_FLAG_REENTER marks that a call is already in progress.
 */
static void blk_invoke_request_fn(struct request_queue *q)
{
	if (queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
		/*
		 * The flag was already set, so we are nested inside an
		 * earlier invocation. Do not recurse any further: mark the
		 * queue plugged and let the unplug work run it later.
		 */
		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
		kblockd_schedule_work(q, &q->unplug_work);
		return;
	}

	/*
	 * One level of recursion is ok and is much faster than kicking
	 * the unplug handling.
	 */
	q->request_fn(q);
	queue_flag_clear(QUEUE_FLAG_REENTER, q);
}
/** /**
* blk_start_queue - restart a previously stopped queue * blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question * @q: The &struct request_queue in question
...@@ -336,18 +351,7 @@ void blk_start_queue(struct request_queue *q) ...@@ -336,18 +351,7 @@ void blk_start_queue(struct request_queue *q)
WARN_ON(!irqs_disabled()); WARN_ON(!irqs_disabled());
queue_flag_clear(QUEUE_FLAG_STOPPED, q); queue_flag_clear(QUEUE_FLAG_STOPPED, q);
blk_invoke_request_fn(q);
/*
* one level of recursion is ok and is much faster than kicking
* the unplug handling
*/
if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
q->request_fn(q);
queue_flag_clear(QUEUE_FLAG_REENTER, q);
} else {
blk_plug_device(q);
kblockd_schedule_work(q, &q->unplug_work);
}
} }
EXPORT_SYMBOL(blk_start_queue); EXPORT_SYMBOL(blk_start_queue);
...@@ -405,15 +409,8 @@ void __blk_run_queue(struct request_queue *q) ...@@ -405,15 +409,8 @@ void __blk_run_queue(struct request_queue *q)
* Only recurse once to avoid overrunning the stack, let the unplug * Only recurse once to avoid overrunning the stack, let the unplug
* handling reinvoke the handler shortly if we already got there. * handling reinvoke the handler shortly if we already got there.
*/ */
if (!elv_queue_empty(q)) { if (!elv_queue_empty(q))
if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { blk_invoke_request_fn(q);
q->request_fn(q);
queue_flag_clear(QUEUE_FLAG_REENTER, q);
} else {
blk_plug_device(q);
kblockd_schedule_work(q, &q->unplug_work);
}
}
} }
EXPORT_SYMBOL(__blk_run_queue); EXPORT_SYMBOL(__blk_run_queue);
...@@ -1056,6 +1053,7 @@ EXPORT_SYMBOL(blk_put_request); ...@@ -1056,6 +1053,7 @@ EXPORT_SYMBOL(blk_put_request);
void init_request_from_bio(struct request *req, struct bio *bio) void init_request_from_bio(struct request *req, struct bio *bio)
{ {
req->cpu = bio->bi_comp_cpu;
req->cmd_type = REQ_TYPE_FS; req->cmd_type = REQ_TYPE_FS;
/* /*
...@@ -1198,13 +1196,15 @@ static int __make_request(struct request_queue *q, struct bio *bio) ...@@ -1198,13 +1196,15 @@ static int __make_request(struct request_queue *q, struct bio *bio)
init_request_from_bio(req, bio); init_request_from_bio(req, bio);
spin_lock_irq(q->queue_lock); spin_lock_irq(q->queue_lock);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
bio_flagged(bio, BIO_CPU_AFFINE))
req->cpu = blk_cpu_to_group(smp_processor_id());
if (elv_queue_empty(q)) if (elv_queue_empty(q))
blk_plug_device(q); blk_plug_device(q);
add_request(q, req); add_request(q, req);
out: out:
if (sync) if (sync)
__generic_unplug_device(q); __generic_unplug_device(q);
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
return 0; return 0;
......
...@@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) ...@@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
} }
EXPORT_SYMBOL(blk_queue_update_dma_alignment); EXPORT_SYMBOL(blk_queue_update_dma_alignment);
static int __init blk_settings_init(void) int __init blk_settings_init(void)
{ {
blk_max_low_pfn = max_low_pfn - 1; blk_max_low_pfn = max_low_pfn - 1;
blk_max_pfn = max_pfn - 1; blk_max_pfn = max_pfn - 1;
......
...@@ -13,6 +13,70 @@ ...@@ -13,6 +13,70 @@
static DEFINE_PER_CPU(struct list_head, blk_cpu_done); static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
/*
* Softirq action handler - move entries to local list and loop over them
* while passing them to the queue registered handler.
*/
static void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
local_irq_disable();
cpu_list = &__get_cpu_var(blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {
struct request *rq;
rq = list_entry(local_list.next, struct request, csd.list);
list_del_init(&rq->csd.list);
rq->q->softirq_done_fn(rq);
}
}
#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
/*
 * Callback installed by raise_blk_irq() as the call_single_data function.
 * @data is the request to complete: queue it on the local blk_cpu_done
 * list and raise BLOCK_SOFTIRQ if the list was empty beforehand.
 */
static void trigger_softirq(void *data)
{
	struct request *req = data;
	struct list_head *done_list;
	unsigned long irq_flags;
	int was_empty;

	local_irq_save(irq_flags);

	done_list = &__get_cpu_var(blk_cpu_done);
	was_empty = list_empty(done_list);
	list_add_tail(&req->csd.list, done_list);

	/* our entry is first on the list only if nothing was queued before */
	if (was_empty)
		raise_softirq_irqoff(BLOCK_SOFTIRQ);

	local_irq_restore(irq_flags);
}
/*
 * Arrange for trigger_softirq() to be run for @rq on @cpu, using the
 * call_single_data embedded in the request.
 *
 * Returns 0 if the call was issued, 1 if @cpu is not online (nothing is
 * done in that case).
 */
static int raise_blk_irq(int cpu, struct request *rq)
{
	struct call_single_data *csd;

	if (!cpu_online(cpu))
		return 1;

	csd = &rq->csd;
	csd->func = trigger_softirq;
	csd->info = rq;
	csd->flags = 0;

	__smp_call_function_single(cpu, csd);
	return 0;
}
#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
/*
 * Fallback used when the SMP / generic smp helper configuration is not
 * available: always returns 1 and touches neither argument.
 */
static int raise_blk_irq(int cpu, struct request *rq)
{
	return 1;
}
#endif
static int __cpuinit blk_cpu_notify(struct notifier_block *self, static int __cpuinit blk_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu) unsigned long action, void *hcpu)
{ {
...@@ -33,33 +97,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, ...@@ -33,33 +97,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
return NOTIFY_OK; return NOTIFY_OK;
} }
static struct notifier_block __cpuinitdata blk_cpu_notifier = {
static struct notifier_block blk_cpu_notifier __cpuinitdata = {
.notifier_call = blk_cpu_notify, .notifier_call = blk_cpu_notify,
}; };
/*
* splice the completion data to a local structure and hand off to
* process_completion_queue() to complete the requests
*/
static void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
local_irq_disable();
cpu_list = &__get_cpu_var(blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {
struct request *rq;
rq = list_entry(local_list.next, struct request, donelist);
list_del_init(&rq->donelist);
rq->q->softirq_done_fn(rq);
}
}
/** /**
* blk_complete_request - end I/O on a request * blk_complete_request - end I/O on a request
* @req: the request being processed * @req: the request being processed
...@@ -71,25 +112,48 @@ static void blk_done_softirq(struct softirq_action *h) ...@@ -71,25 +112,48 @@ static void blk_done_softirq(struct softirq_action *h)
* through a softirq handler. The user must have registered a completion * through a softirq handler. The user must have registered a completion
* callback through blk_queue_softirq_done(). * callback through blk_queue_softirq_done().
**/ **/
void blk_complete_request(struct request *req) void blk_complete_request(struct request *req)
{ {
struct list_head *cpu_list; struct request_queue *q = req->q;
unsigned long flags; unsigned long flags;
int ccpu, cpu, group_cpu;
BUG_ON(!req->q->softirq_done_fn); BUG_ON(!q->softirq_done_fn);
local_irq_save(flags); local_irq_save(flags);
cpu = smp_processor_id();
group_cpu = blk_cpu_to_group(cpu);
cpu_list = &__get_cpu_var(blk_cpu_done); /*
list_add_tail(&req->donelist, cpu_list); * Select completion CPU
raise_softirq_irqoff(BLOCK_SOFTIRQ); */
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
ccpu = req->cpu;
else
ccpu = cpu;
if (ccpu == cpu || ccpu == group_cpu) {
struct list_head *list;
do_local:
list = &__get_cpu_var(blk_cpu_done);
list_add_tail(&req->csd.list, list);
/*
* if the list only contains our just added request,
* signal a raise of the softirq. If there are already
* entries there, someone already raised the irq but it
* hasn't run yet.
*/
if (list->next == &req->csd.list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
} else if (raise_blk_irq(ccpu, req))
goto do_local;
local_irq_restore(flags); local_irq_restore(flags);
} }
EXPORT_SYMBOL(blk_complete_request); EXPORT_SYMBOL(blk_complete_request);
int __init blk_softirq_init(void) __init int blk_softirq_init(void)
{ {
int i; int i;
......
...@@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, ...@@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
return ret; return ret;
} }
/*
 * sysfs show handler for rq_affinity: emits 1 when QUEUE_FLAG_SAME_COMP
 * is set on the queue and 0 otherwise, formatted by queue_var_show().
 */
static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
{
	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
		return queue_var_show(1, page);

	return queue_var_show(0, page);
}
/*
 * sysfs store handler for rq_affinity.
 *
 * Parses the value in @page with queue_var_store(); a non-zero value sets
 * QUEUE_FLAG_SAME_COMP on @q, zero clears it. The flag is changed under
 * q->queue_lock.
 *
 * Returns whatever queue_var_store() returned, or -EINVAL when
 * CONFIG_USE_GENERIC_SMP_HELPERS is not defined (the setting cannot be
 * changed in that configuration).
 */
static ssize_t
queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
{
	ssize_t ret = -EINVAL;
#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
	unsigned long val;

	ret = queue_var_store(&val, page, count);
	/*
	 * Do not act on 'val' if parsing reported an error: it may not have
	 * been written. NOTE(review): queue_var_store() is not visible here;
	 * this assumes it signals failure with a negative return - confirm.
	 */
	if (ret < 0)
		return ret;

	spin_lock_irq(q->queue_lock);
	if (val)
		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
	else
		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
	spin_unlock_irq(q->queue_lock);
#endif
	return ret;
}
static struct queue_sysfs_entry queue_requests_entry = { static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
...@@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = { ...@@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = {
.store = queue_nomerges_store, .store = queue_nomerges_store,
}; };
/*
 * sysfs attribute "rq_affinity" (mode S_IRUGO | S_IWUSR), wired to the
 * rq_affinity show/store handlers.
 */
static struct queue_sysfs_entry queue_rq_affinity_entry = {
	.attr = {
		.name = "rq_affinity",
		.mode = S_IRUGO | S_IWUSR,
	},
	.show = queue_rq_affinity_show,
	.store = queue_rq_affinity_store,
};
static struct attribute *default_attrs[] = { static struct attribute *default_attrs[] = {
&queue_requests_entry.attr, &queue_requests_entry.attr,
&queue_ra_entry.attr, &queue_ra_entry.attr,
...@@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = { ...@@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = {
&queue_iosched_entry.attr, &queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr, &queue_hw_sector_size_entry.attr,
&queue_nomerges_entry.attr, &queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
NULL, NULL,
}; };
......
...@@ -59,4 +59,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) ...@@ -59,4 +59,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
#endif /* BLK_DEV_INTEGRITY */ #endif /* BLK_DEV_INTEGRITY */
/*
 * Map @cpu to the CPU that represents its group:
 *  - CONFIG_SCHED_MC:  first CPU in cpu_coregroup_map(cpu)
 *  - CONFIG_SCHED_SMT: first CPU in the cpu_sibling_map of @cpu
 *  - otherwise:        @cpu itself
 */
static inline int blk_cpu_to_group(int cpu)
{
	int group_cpu = cpu;

#if defined(CONFIG_SCHED_MC)
	{
		cpumask_t core_mask = cpu_coregroup_map(cpu);

		group_cpu = first_cpu(core_mask);
	}
#elif defined(CONFIG_SCHED_SMT)
	group_cpu = first_cpu(per_cpu(cpu_sibling_map, cpu));
#endif

	return group_cpu;
}
#endif #endif
...@@ -111,6 +111,7 @@ void bio_init(struct bio *bio) ...@@ -111,6 +111,7 @@ void bio_init(struct bio *bio)
{ {
memset(bio, 0, sizeof(*bio)); memset(bio, 0, sizeof(*bio));
bio->bi_flags = 1 << BIO_UPTODATE; bio->bi_flags = 1 << BIO_UPTODATE;
bio->bi_comp_cpu = -1;
atomic_set(&bio->bi_cnt, 1); atomic_set(&bio->bi_cnt, 1);
} }
......
...@@ -81,6 +81,8 @@ struct bio { ...@@ -81,6 +81,8 @@ struct bio {
unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
unsigned int bi_comp_cpu; /* completion CPU */
struct bio_vec *bi_io_vec; /* the actual vec list */ struct bio_vec *bi_io_vec; /* the actual vec list */
bio_end_io_t *bi_end_io; bio_end_io_t *bi_end_io;
...@@ -105,6 +107,7 @@ struct bio { ...@@ -105,6 +107,7 @@ struct bio {
#define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_BOUNCED 5 /* bio is a bounce bio */
#define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_USER_MAPPED 6 /* contains user pages */
#define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_EOPNOTSUPP 7 /* not supported */
#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/* /*
...@@ -342,6 +345,14 @@ void zero_fill_bio(struct bio *bio); ...@@ -342,6 +345,14 @@ void zero_fill_bio(struct bio *bio);
extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
extern unsigned int bvec_nr_vecs(unsigned short idx); extern unsigned int bvec_nr_vecs(unsigned short idx);
/*
 * bio_set_completion_cpu - let the queuer pick a completion CPU for a bio
 * @bio: bio to tag
 * @cpu: CPU number stored in the bio's bi_comp_cpu field
 */
static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
{
	bio->bi_comp_cpu = cpu;
}
/* /*
* bio_set is used to allow other portions of the IO system to * bio_set is used to allow other portions of the IO system to
* allocate their own private memory pools for bio and iovec structures. * allocate their own private memory pools for bio and iovec structures.
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/stringify.h> #include <linux/stringify.h>
#include <linux/bsg.h> #include <linux/bsg.h>
#include <linux/smp.h>
#include <asm/scatterlist.h> #include <asm/scatterlist.h>
...@@ -139,7 +140,8 @@ enum rq_flag_bits { ...@@ -139,7 +140,8 @@ enum rq_flag_bits {
*/ */
struct request { struct request {
struct list_head queuelist; struct list_head queuelist;
struct list_head donelist; struct call_single_data csd;
int cpu;
struct request_queue *q; struct request_queue *q;
...@@ -420,6 +422,7 @@ struct request_queue ...@@ -420,6 +422,7 @@ struct request_queue
#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
#define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
#define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */
static inline int queue_is_locked(struct request_queue *q) static inline int queue_is_locked(struct request_queue *q)
{ {
......
...@@ -173,15 +173,15 @@ enum { ...@@ -173,15 +173,15 @@ enum {
#define rb_entry_rq(node) rb_entry((node), struct request, rb_node) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
/* /*
* Hack to reuse the donelist list_head as the fifo time holder while * Hack to reuse the csd.list list_head as the fifo time holder while
* the request is in the io scheduler. Saves an unsigned long in rq. * the request is in the io scheduler. Saves an unsigned long in rq.
*/ */
#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next) #define rq_fifo_time(rq) ((unsigned long) (rq)->csd.list.next)
#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp)) #define rq_set_fifo_time(rq,exp) ((rq)->csd.list.next = (void *) (exp))
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) do { \ #define rq_fifo_clear(rq) do { \
list_del_init(&(rq)->queuelist); \ list_del_init(&(rq)->queuelist); \
INIT_LIST_HEAD(&(rq)->donelist); \ INIT_LIST_HEAD(&(rq)->csd.list); \
} while (0) } while (0)
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment