Commit eba5b46c authored by Jens Axboe

[PATCH] block plugging reworked

This patch provides the ability for a block driver to signal it's too
busy to receive more work and temporarily halt the request queue. In
concept it's similar to the networking netif_{start,stop}_queue helpers.

To do this cleanly, I've ripped out the old tq_disk task queue. Instead,
an internal list of plugged queues is maintained, honoring the current
queue state (see the QUEUE_FLAG_STOPPED bit). Execution of request_fn
has been moved to tasklet context. blk_run_queues() provides
functionality similar to the old run_task_queue(&tq_disk).
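
Roughly, a request_fn based driver uses the new interface the way the
cciss/cpqarray conversions below do. The following is only an
illustrative sketch, not part of the patch: the mydrv_* names are made
up, and it assumes the usual elv_next_request()/blkdev_dequeue_request()
helpers.

	/* sketch: halt the queue while the controller is full */
	static void mydrv_request_fn(request_queue_t *q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			if (mydrv_controller_full()) {
				/*
				 * too busy: no further request_fn invocations
				 * until blk_start_queue() is called
				 */
				blk_stop_queue(q);
				return;
			}
			blkdev_dequeue_request(rq);
			mydrv_issue(rq);
		}
	}

	static void mydrv_intr(int irq, void *dev_id, struct pt_regs *regs)
	{
		request_queue_t *q = mydrv_to_queue(dev_id); /* placeholder */

		/* ... complete finished commands ... */

		/*
		 * room again: clear QUEUE_FLAG_STOPPED and reschedule the
		 * plug tasklet, which will run the request_fn
		 */
		blk_start_queue(q);
	}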

Now, this only works at the request_fn level and not at the
make_request_fn level. This is on purpose: drivers working at the
make_request_fn level are essentially providing a piece of the block
level infrastructure themselves. There are basically two reasons for
doing make_request_fn style setups:

o block remappers. Start/stop functionality will be handled at the
  target device in this case, which is the level that signals hardware
  full (or continue) anyway.

o drivers that wish to receive single "buffer" entities rather than
  merged requests. These could use the start/stop functionality. I'd
  suggest _still_ using a request_fn for these, but setting the queue
  options so that no merging ever takes place. This has the added bonus
  of providing the usual request depletion throttling at the block
  level.
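
Callers that used to kick tq_disk before sleeping on I/O convert
mechanically, as the md and fs/buffer.c hunks below show. With the
bio/completion setup elided, the new wait pattern looks roughly like:

	submit_bio(rw, &bio);
	blk_run_queues();	/* unplug every plugged, non-stopped queue */
	wait_for_completion(&event);
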
parent c43626f4
......@@ -281,7 +281,7 @@ static CommandList_struct * cmd_alloc(ctlr_info_t *h, int get_from_pool)
i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS);
if (i == NR_CMDS)
return NULL;
} while(test_and_set_bit(i & 31, h->cmd_pool_bits+(i/32)) != 0);
} while(test_and_set_bit(i & (BITS_PER_LONG - 1), h->cmd_pool_bits+(i/BITS_PER_LONG)) != 0);
#ifdef CCISS_DEBUG
printk(KERN_DEBUG "cciss: using command buffer %d\n", i);
#endif
......@@ -327,7 +327,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool)
} else
{
i = c - h->cmd_pool;
clear_bit(i%32, h->cmd_pool_bits+(i/32));
clear_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG));
h->nr_frees++;
}
}
......@@ -338,7 +338,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool)
static void cciss_geninit( int ctlr)
{
drive_info_struct *drv;
int i,j;
int i;
/* Loop through each real device */
hba[ctlr]->gendisk.nr_real = 0;
......@@ -1883,6 +1883,7 @@ static void do_cciss_request(request_queue_t *q)
goto queue;
startio:
blk_stop_queue(q);
start_io(h);
}
......@@ -1943,8 +1944,8 @@ static void do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs)
/*
* See if we can queue up some more IO
*/
do_cciss_request(BLK_DEFAULT_QUEUE(MAJOR_NR + h->ctlr));
spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
blk_start_queue(BLK_DEFAULT_QUEUE(MAJOR_NR + h->ctlr));
}
/*
* We cannot read the structure directly, for portablity we must use
......@@ -2448,8 +2449,7 @@ static int __init cciss_init_one(struct pci_dev *pdev,
free_hba(i);
return(-1);
}
hba[i]->cmd_pool_bits = (__u32*)kmalloc(
((NR_CMDS+31)/32)*sizeof(__u32), GFP_KERNEL);
hba[i]->cmd_pool_bits = kmalloc(((NR_CMDS+BITS_PER_LONG-1)/BITS_PER_LONG)*sizeof(unsigned long), GFP_KERNEL);
hba[i]->cmd_pool = (CommandList_struct *)pci_alloc_consistent(
hba[i]->pdev, NR_CMDS * sizeof(CommandList_struct),
&(hba[i]->cmd_pool_dhandle));
......@@ -2484,7 +2484,7 @@ static int __init cciss_init_one(struct pci_dev *pdev,
pci_set_drvdata(pdev, hba[i]);
/* command and error info recs zeroed out before
they are used */
memset(hba[i]->cmd_pool_bits, 0, ((NR_CMDS+31)/32)*sizeof(__u32));
memset(hba[i]->cmd_pool_bits, 0, ((NR_CMDS+BITS_PER_LONG-1)/BITS_PER_LONG)*sizeof(unsigned long));
#ifdef CCISS_DEBUG
printk(KERN_DEBUG "Scanning for drives on controller cciss%d\n",i);
......
......@@ -76,7 +76,7 @@ struct ctlr_info
dma_addr_t cmd_pool_dhandle;
ErrorInfo_struct *errinfo_pool;
dma_addr_t errinfo_pool_dhandle;
__u32 *cmd_pool_bits;
unsigned long *cmd_pool_bits;
int nr_allocs;
int nr_frees;
......
......@@ -166,7 +166,7 @@ static int ida_proc_get_info(char *buffer, char **start, off_t offset,
static void ida_geninit(int ctlr)
{
int i,j;
int i;
drv_info_t *drv;
for(i=0; i<NWD; i++) {
......@@ -409,8 +409,7 @@ int __init cpqarray_init(void)
hba[i]->cmd_pool = (cmdlist_t *)pci_alloc_consistent(
hba[i]->pci_dev, NR_CMDS * sizeof(cmdlist_t),
&(hba[i]->cmd_pool_dhandle));
hba[i]->cmd_pool_bits = (__u32*)kmalloc(
((NR_CMDS+31)/32)*sizeof(__u32), GFP_KERNEL);
hba[i]->cmd_pool_bits = kmalloc(((NR_CMDS+BITS_PER_LONG-1)/BITS_PER_LONG)*sizeof(unsigned long), GFP_KERNEL);
if(hba[i]->cmd_pool_bits == NULL || hba[i]->cmd_pool == NULL)
{
......@@ -441,7 +440,7 @@ int __init cpqarray_init(void)
}
memset(hba[i]->cmd_pool, 0, NR_CMDS * sizeof(cmdlist_t));
memset(hba[i]->cmd_pool_bits, 0, ((NR_CMDS+31)/32)*sizeof(__u32));
memset(hba[i]->cmd_pool_bits, 0, ((NR_CMDS+BITS_PER_LONG-1)/BITS_PER_LONG)*sizeof(unsigned long));
printk(KERN_INFO "cpqarray: Finding drives on %s",
hba[i]->devname);
getgeometry(i);
......@@ -916,6 +915,7 @@ DBGPX( printk("Submitting %d sectors in %d segments\n", creq->nr_sectors, seg);
goto queue_next;
startio:
blk_stop_queue(q);
start_io(h);
}
......@@ -1066,8 +1066,8 @@ static void do_ida_intr(int irq, void *dev_id, struct pt_regs *regs)
/*
* See if we can queue up some more IO
*/
do_ida_request(BLK_DEFAULT_QUEUE(MAJOR_NR + h->ctlr));
spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags);
blk_start_queue(BLK_DEFAULT_QUEUE(MAJOR_NR + h->ctlr));
}
/*
......@@ -1333,7 +1333,7 @@ static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool)
i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS);
if (i == NR_CMDS)
return NULL;
} while(test_and_set_bit(i%32, h->cmd_pool_bits+(i/32)) != 0);
} while(test_and_set_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG)) != 0);
c = h->cmd_pool + i;
cmd_dhandle = h->cmd_pool_dhandle + i*sizeof(cmdlist_t);
h->nr_allocs++;
......@@ -1353,7 +1353,7 @@ static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool)
c->busaddr);
} else {
i = c - h->cmd_pool;
clear_bit(i%32, h->cmd_pool_bits+(i/32));
clear_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG));
h->nr_frees++;
}
}
......
......@@ -104,7 +104,7 @@ struct ctlr_info {
cmdlist_t *cmpQ;
cmdlist_t *cmd_pool;
dma_addr_t cmd_pool_dhandle;
__u32 *cmd_pool_bits;
unsigned long *cmd_pool_bits;
spinlock_t lock;
unsigned int Qdepth;
......
......@@ -3900,7 +3900,7 @@ static int __floppy_read_block_0(struct block_device *bdev)
bio.bi_end_io = floppy_rb0_complete;
submit_bio(READ, &bio);
run_task_queue(&tq_disk);
generic_unplug_device(bdev_get_queue(bdev));
process_fd_request();
wait_for_completion(&complete);
......
......@@ -49,6 +49,9 @@ extern int mac_floppy_init(void);
*/
static kmem_cache_t *request_cachep;
static struct list_head blk_plug_list;
static spinlock_t blk_plug_lock = SPIN_LOCK_UNLOCKED;
/*
* The "disk" task queue is used to start the actual requests
* after a plug
......@@ -791,8 +794,12 @@ void blk_plug_device(request_queue_t *q)
if (!elv_queue_empty(q))
return;
if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
queue_task(&q->plug_tq, &tq_disk);
if (test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
return;
spin_lock(&blk_plug_lock);
list_add_tail(&q->plug.list, &blk_plug_list);
spin_unlock(&blk_plug_lock);
}
/*
......@@ -803,8 +810,13 @@ static inline void __generic_unplug_device(request_queue_t *q)
/*
* not plugged
*/
if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
if (!__test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
return;
if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
printk("queue was stopped\n");
return;
}
/*
* was plugged, fire request_fn if queue has stuff to do
......@@ -815,7 +827,7 @@ static inline void __generic_unplug_device(request_queue_t *q)
/**
* generic_unplug_device - fire a request queue
* @q: The &request_queue_t in question
* @data: The &request_queue_t in question
*
* Description:
* Linux uses plugging to build bigger requests queues before letting
......@@ -826,6 +838,16 @@ static inline void __generic_unplug_device(request_queue_t *q)
* queue is invoked and transfers started.
**/
void generic_unplug_device(void *data)
{
request_queue_t *q = data;
tasklet_schedule(&q->plug.task);
}
/*
* the plug tasklet
*/
static void blk_task_run(unsigned long data)
{
request_queue_t *q = (request_queue_t *) data;
unsigned long flags;
......@@ -835,6 +857,49 @@ void generic_unplug_device(void *data)
spin_unlock_irqrestore(q->queue_lock, flags);
}
/*
* clear top flag and schedule tasklet for execution
*/
void blk_start_queue(request_queue_t *q)
{
if (test_and_clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
tasklet_enable(&q->plug.task);
tasklet_schedule(&q->plug.task);
}
/*
* set stop bit and disable any pending tasklet
*/
void blk_stop_queue(request_queue_t *q)
{
if (!test_and_set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
tasklet_disable(&q->plug.task);
}
/*
* the equivalent of the previous tq_disk run
*/
void blk_run_queues(void)
{
struct list_head *tmp, *n;
unsigned long flags;
/*
* we could splice to the stack prior to running
*/
spin_lock_irqsave(&blk_plug_lock, flags);
list_for_each_safe(tmp, n, &blk_plug_list) {
request_queue_t *q = list_entry(tmp, request_queue_t,plug.list);
if (!test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
list_del(&q->plug.list);
tasklet_schedule(&q->plug.task);
}
}
spin_unlock_irqrestore(&blk_plug_lock, flags);
}
static int __blk_cleanup_queue(struct request_list *list)
{
struct list_head *head = &list->free;
......@@ -974,9 +1039,6 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
q->front_merge_fn = ll_front_merge_fn;
q->merge_requests_fn = ll_merge_requests_fn;
q->prep_rq_fn = NULL;
q->plug_tq.sync = 0;
q->plug_tq.routine = &generic_unplug_device;
q->plug_tq.data = q;
q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
q->queue_lock = lock;
......@@ -987,6 +1049,10 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
INIT_LIST_HEAD(&q->plug.list);
tasklet_init(&q->plug.task, blk_task_run, (unsigned long) q);
return 0;
}
......@@ -1867,6 +1933,8 @@ int __init blk_dev_init(void)
blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn;
INIT_LIST_HEAD(&blk_plug_list);
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
hd_init();
#endif
......@@ -1910,3 +1978,7 @@ EXPORT_SYMBOL(blk_queue_free_tags);
EXPORT_SYMBOL(blk_queue_start_tag);
EXPORT_SYMBOL(blk_queue_end_tag);
EXPORT_SYMBOL(blk_queue_invalidate_tags);
EXPORT_SYMBOL(blk_start_queue);
EXPORT_SYMBOL(blk_stop_queue);
EXPORT_SYMBOL(blk_run_queues);
......@@ -252,7 +252,9 @@ static unsigned long smart1_completed(ctlr_info_t *h)
outb(CHANNEL_CLEAR, h->ioaddr + SMART1_LOCAL_DOORBELL);
#error Please convert me to Documentation/DMA-mapping.txt
/*
* this is x86 (actually compaq x86) only, so it's ok
*/
if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
} else {
cmd = 0;
......
......@@ -491,7 +491,7 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
bio.bi_private = &event;
bio.bi_end_io = bi_complete;
submit_bio(rw, &bio);
run_task_queue(&tq_disk);
blk_run_queues();
wait_for_completion(&event);
return test_bit(BIO_UPTODATE, &bio.bi_flags);
......@@ -2955,7 +2955,7 @@ int md_thread(void * arg)
run = thread->run;
if (run) {
run(thread->data);
run_task_queue(&tq_disk);
blk_run_queues();
}
if (signal_pending(current))
flush_curr_signals();
......@@ -3411,7 +3411,7 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
last_check = j;
run_task_queue(&tq_disk);
blk_run_queues();
repeat:
if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
......
......@@ -138,7 +138,7 @@ void __wait_on_buffer(struct buffer_head * bh)
get_bh(bh);
add_wait_queue(wq, &wait);
do {
run_task_queue(&tq_disk);
blk_run_queues();
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!buffer_locked(bh))
break;
......@@ -487,7 +487,7 @@ static void free_more_memory(void)
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
run_task_queue(&tq_disk);
blk_run_queues();
__set_current_state(TASK_RUNNING);
yield();
}
......@@ -1004,7 +1004,7 @@ create_buffers(struct page * page, unsigned long size, int async)
* the reserve list is empty, we're sure there are
* async buffer heads in use.
*/
run_task_queue(&tq_disk);
blk_run_queues();
free_more_memory();
goto try_again;
......@@ -2452,7 +2452,7 @@ EXPORT_SYMBOL(try_to_free_buffers);
int block_sync_page(struct page *page)
{
run_task_queue(&tq_disk);
blk_run_queues();
return 0;
}
......
......@@ -112,7 +112,7 @@ void kiobuf_wait_for_io(struct kiobuf *kiobuf)
repeat:
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (atomic_read(&kiobuf->io_count) != 0) {
run_task_queue(&tq_disk);
blk_run_queues();
schedule();
if (atomic_read(&kiobuf->io_count) != 0)
goto repeat;
......
......@@ -179,7 +179,7 @@ static void __flush_batch(struct buffer_head **bhs, int *batch_count)
spin_unlock(&journal_datalist_lock);
ll_rw_block(WRITE, *batch_count, bhs);
run_task_queue(&tq_disk);
blk_run_queues();
spin_lock(&journal_datalist_lock);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
......
......@@ -32,7 +32,7 @@ void wait_buffer_until_released (const struct buffer_head * bh)
bh, repeat_counter, buffer_journaled(bh) ? ' ' : '!',
buffer_journal_dirty(bh) ? ' ' : '!');
}
run_task_queue(&tq_disk);
blk_run_queues();
yield();
}
if (repeat_counter > 30000000) {
......
......@@ -8,6 +8,7 @@
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/interrupt.h>
#include <asm/scatterlist.h>
......@@ -136,6 +137,11 @@ struct blk_queue_tag {
int max_depth;
};
struct blk_plug {
struct list_head list;
struct tasklet_struct task;
};
/*
* Default nr free requests per queue, ll_rw_blk will scale it down
* according to available RAM at init time
......@@ -177,10 +183,7 @@ struct request_queue
unsigned long bounce_pfn;
int bounce_gfp;
/*
* This is used to remove the plug when tq_disk runs.
*/
struct tq_struct plug_tq;
struct blk_plug plug;
/*
* various queue flags, see QUEUE_* below
......@@ -217,6 +220,7 @@ struct request_queue
#define QUEUE_FLAG_PLUGGED 0 /* queue is plugged */
#define QUEUE_FLAG_CLUSTER 1 /* cluster several segments into 1 */
#define QUEUE_FLAG_QUEUED 2 /* uses generic tag queueing */
#define QUEUE_FLAG_STOPPED 3 /* queue is stopped */
#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
#define blk_mark_plugged(q) set_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
......@@ -303,6 +307,8 @@ extern void blk_recount_segments(request_queue_t *, struct bio *);
extern inline int blk_phys_contig_segment(request_queue_t *q, struct bio *, struct bio *);
extern inline int blk_hw_contig_segment(request_queue_t *q, struct bio *, struct bio *);
extern int block_ioctl(struct block_device *, unsigned int, unsigned long);
extern void blk_start_queue(request_queue_t *q);
extern void blk_stop_queue(request_queue_t *q);
/*
* get ready for proper ref counting
......
......@@ -1079,6 +1079,7 @@ extern int blkdev_get(struct block_device *, mode_t, unsigned, int);
extern int blkdev_put(struct block_device *, int);
extern int bd_claim(struct block_device *, void *);
extern void bd_release(struct block_device *);
extern void blk_run_queues(void);
/* fs/devices.c */
extern const struct block_device_operations *get_blkfops(unsigned int);
......
......@@ -66,7 +66,7 @@ typedef struct list_head task_queue;
#define DECLARE_TASK_QUEUE(q) LIST_HEAD(q)
#define TQ_ACTIVE(q) (!list_empty(&q))
extern task_queue tq_timer, tq_immediate, tq_disk, tq_bdflush;
extern task_queue tq_timer, tq_immediate, tq_bdflush;
/*
* To implement your own list of active bottom halfs, use the following
......
......@@ -337,7 +337,6 @@ EXPORT_SYMBOL(ioctl_by_bdev);
EXPORT_SYMBOL(grok_partitions);
EXPORT_SYMBOL(register_disk);
EXPORT_SYMBOL(read_dev_sector);
EXPORT_SYMBOL(tq_disk);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(wipe_partitions);
......
......@@ -220,7 +220,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
if (gfp_mask == gfp_nowait)
return NULL;
run_task_queue(&tq_disk);
blk_run_queues();
add_wait_queue_exclusive(&pool->wait, &wait);
set_task_state(current, TASK_UNINTERRUPTIBLE);
......
......@@ -148,7 +148,7 @@ static void background_writeout(unsigned long _min_pages)
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
min_pages -= MAX_WRITEBACK_PAGES - nr_to_write;
} while (nr_to_write <= 0);
run_task_queue(&tq_disk);
blk_run_queues();
}
/*
......@@ -206,7 +206,7 @@ static void wb_kupdate(unsigned long arg)
next_jif = start_jif + wb_writeback_jifs;
nr_to_write = ps.nr_dirty;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
run_task_queue(&tq_disk);
blk_run_queues();
yield();
if (time_before(next_jif, jiffies + HZ))
......
......@@ -161,7 +161,7 @@ void do_page_cache_readahead(struct file *file,
/*
* Do this now, rather than at the next wait_on_page_locked().
*/
run_task_queue(&tq_disk);
blk_run_queues();
if (!list_empty(&page_pool))
BUG();
......
......@@ -808,7 +808,7 @@ int kswapd(void *unused)
* up on a more timely basis.
*/
kswapd_balance();
run_task_queue(&tq_disk);
blk_run_queues();
}
}
......