/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>

/*
 * for max sense size
 */
#include <scsi/scsi_cmnd.h>

static void blk_unplug_work(void *data);
static void blk_unplug_timeout(unsigned long data);

/*
 * For the allocated request tables
 */
static kmem_cache_t *request_cachep;

/*
 * plug management
 */
static LIST_HEAD(blk_plug_list);
static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;

static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue; 

unsigned long blk_max_low_pfn, blk_max_pfn;

EXPORT_SYMBOL(blk_max_low_pfn);
EXPORT_SYMBOL(blk_max_pfn);

/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME	(HZ/50UL)

/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ	32

/*
 * Return the threshold (number of used requests) at which the queue is
 * considered to be congested.  It includes a little hysteresis to keep the
 * context switch rate down.
 */
static inline int queue_congestion_on_threshold(struct request_queue *q)
{
	int ret;

	ret = q->nr_requests - (q->nr_requests / 8) + 1;

	if (ret > q->nr_requests)
		ret = q->nr_requests;

	return ret;
}

/*
 * The threshold at which a queue is considered to be uncongested
 */
static inline int queue_congestion_off_threshold(struct request_queue *q)
{
	int ret;

	ret = q->nr_requests - (q->nr_requests / 8) - 1;

	if (ret < 1)
		ret = 1;

	return ret;
}
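
/*
 * Worked example (illustrative, not from the original file): with a queue
 * of 128 requests, the on-threshold above works out to 128 - 16 + 1 = 113
 * used requests while the off-threshold is 128 - 16 - 1 = 111, so the
 * congested state is entered and left at different points and cannot
 * chatter around a single value.
 */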

/*
 * A queue has just exited congestion.  Note this in the global counter of
 * congested queues, and wake up anyone who was waiting for requests to be
 * put back.
 */
static void clear_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[rw];

	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
	clear_bit(bit, &q->backing_dev_info.state);
	smp_mb__after_clear_bit();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}

/*
 * A queue has just entered congestion.  Flag that in the queue's VM-visible
 * state flags and increment the global counter of congested queues.
 */
static void set_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state bit;

	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
	set_bit(bit, &q->backing_dev_info.state);
}

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	request_queue_t *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}

void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
{
	q->activity_fn = fn;
	q->activity_data = data;
}

EXPORT_SYMBOL(blk_queue_activity_fn);

/**
 * blk_queue_prep_rq - set a prepare_request function for queue
 * @q:		queue
 * @pfn:	prepare_request function
 *
 * It's possible for a queue to register a prepare_request callback which
 * is invoked before the request is handed to the request_fn. The goal of
 * the function is to prepare a request for I/O; it can be used to build a
 * cdb from the request data, for instance.
 *
 */
void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
{
	q->prep_rq_fn = pfn;
}

EXPORT_SYMBOL(blk_queue_prep_rq);

/**
 * blk_queue_merge_bvec - set a merge_bvec function for queue
 * @q:		queue
 * @mbfn:	merge_bvec_fn
 *
 * Usually queues have static limitations on the max sectors or segments that
 * we can put in a request. Stacking drivers may have some settings that
 * are dynamic, and thus we have to query the queue whether it is ok to
 * add a new bio_vec to a bio at a given offset or not. If the block device
 * has such limitations, it needs to register a merge_bvec_fn to control
 * the size of bio's sent to it. Note that a block device *must* allow a
 * single page to be added to an empty bio. The block device driver may want
 * to use the bio_split() function to deal with these bio's. By default
 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
 * honored.
 */
void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
{
	q->merge_bvec_fn = mbfn;
}

EXPORT_SYMBOL(blk_queue_merge_bvec);

/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:  the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * Description:
 *    The normal way for &struct bios to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices. However some block devices (typically virtual devices
 *    such as md or lvm) do not benefit from the processing on the
 *    request queue, and are served best by having the requests passed
 *    directly to them.  This can be achieved by providing a function
 *    to blk_queue_make_request().
 *
 * Caveat:
 *    The driver that does this *must* be able to deal appropriately
 *    with buffers in "highmemory". This can be accomplished by either calling
 *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
 *    blk_queue_bounce() to create a buffer in normal memory.
 **/
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
	/*
	 * set defaults
	 */
	q->nr_requests = BLKDEV_MAX_RQ;
	q->max_phys_segments = MAX_PHYS_SEGMENTS;
	q->max_hw_segments = MAX_HW_SEGMENTS;
	q->make_request_fn = mfn;
	q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.memory_backed = 0;
	blk_queue_max_sectors(q, MAX_SECTORS);
	blk_queue_hardsect_size(q, 512);
	blk_queue_dma_alignment(q, 511);

	q->unplug_thresh = 4;		/* hmm */
	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;

	INIT_WORK(&q->unplug_work, blk_unplug_work, q);

	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;

	/*
	 * by default assume old behaviour and bounce for any highmem page
	 */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

	INIT_LIST_HEAD(&q->plug_list);

	blk_queue_activity_fn(q, NULL, NULL);
}

EXPORT_SYMBOL(blk_queue_make_request);
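
/*
 * Illustrative sketch, not part of the original file: a virtual device
 * such as md or lvm hooks in its own make_request function roughly like
 * this.  The vdev_* names are hypothetical and the remapping step is
 * elided.
 */
#if 0
static int vdev_make_request(request_queue_t *q, struct bio *bio)
{
	/* remap bio->bi_bdev / bio->bi_sector here, then pass the bio on */
	generic_make_request(bio);
	return 0;
}

static void vdev_setup_queue(request_queue_t *q)
{
	blk_queue_make_request(q, vdev_make_request);
}
#endif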

/**
 * blk_queue_bounce_limit - set bounce buffer limit for queue
 * @q:  the request queue for the device
 * @dma_addr:   bus address limit
 *
 * Description:
 *    Different hardware can have different requirements as to what pages
 *    it can do I/O directly to. A low level driver can call
 *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
 *    buffers for doing I/O to pages residing above @dma_addr. By default
 *    the block layer sets this to the highest numbered "low" memory page.
 **/
void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
{
	unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
	unsigned long mb = dma_addr >> 20;
	static request_queue_t *last_q;

	/*
	 * set appropriate bounce gfp mask -- unfortunately we don't have a
	 * full 4GB zone, so we have to resort to low memory for any bounces.
	 * ISA has its own < 16MB zone.
	 */
	if (bounce_pfn < blk_max_low_pfn) {
		BUG_ON(dma_addr < BLK_BOUNCE_ISA);
		init_emergency_isa_pool();
		q->bounce_gfp = GFP_NOIO | GFP_DMA;
	} else
		q->bounce_gfp = GFP_NOIO;

	/*
	 * keep this for debugging for now...
	 */
	if (dma_addr != BLK_BOUNCE_HIGH && q != last_q) {
		printk("blk: queue %p, ", q);
		if (dma_addr == BLK_BOUNCE_ANY)
			printk("no I/O memory limit\n");
		else
			printk("I/O limit %luMb (mask 0x%Lx)\n", mb, (long long) dma_addr);
	}

	q->bounce_pfn = bounce_pfn;
	last_q = q;
}

EXPORT_SYMBOL(blk_queue_bounce_limit);
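
/*
 * Illustrative sketch, not part of the original file: a driver for a
 * controller that can only address the ISA DMA range would bounce
 * everything above 16MB like this (isa_card_init_queue is a hypothetical
 * name).
 */
#if 0
static void isa_card_init_queue(request_queue_t *q)
{
	blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
}
#endif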

/**
 * blk_queue_max_sectors - set max sectors for a request for this queue
 * @q:  the request queue for the device
 * @max_sectors:  max sectors in the usual 512b unit
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the size of
 *    received requests.
 **/
void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
{
	if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
		max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
		printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
	}

	q->max_sectors = max_sectors;
}

EXPORT_SYMBOL(blk_queue_max_sectors);

/**
 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
 * @q:  the request queue for the device
 * @max_segments:  max number of segments
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the number of
 *    physical data segments in a request.  This would be the largest sized
 *    scatter list the driver could handle.
 **/
void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
{
	if (!max_segments) {
		max_segments = 1;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
	}

	q->max_phys_segments = max_segments;
}

EXPORT_SYMBOL(blk_queue_max_phys_segments);

/**
 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
 * @q:  the request queue for the device
 * @max_segments:  max number of segments
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the number of
 *    hw data segments in a request.  This would be the largest number of
 *    address/length pairs the host adapter can actually give at once
 *    to the device.
 **/
void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
{
	if (!max_segments) {
		max_segments = 1;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
	}

	q->max_hw_segments = max_segments;
}

EXPORT_SYMBOL(blk_queue_max_hw_segments);

/**
 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
 * @q:  the request queue for the device
 * @max_size:  max size of segment in bytes
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the size of a
 *    coalesced segment
 **/
void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
{
	if (max_size < PAGE_CACHE_SIZE) {
		max_size = PAGE_CACHE_SIZE;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
	}

	q->max_segment_size = max_size;
}

EXPORT_SYMBOL(blk_queue_max_segment_size);

/**
 * blk_queue_hardsect_size - set hardware sector size for the queue
 * @q:  the request queue for the device
 * @size:  the hardware sector size, in bytes
 *
 * Description:
 *   This should typically be set to the lowest possible sector size
 *   that the hardware can operate on (possibly without resorting to
 *   internal read-modify-write operations). Usually the default
 *   of 512 covers most hardware.
 **/
void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
{
	q->hardsect_size = size;
}

EXPORT_SYMBOL(blk_queue_hardsect_size);

/*
 * Returns the minimum that is _not_ zero, unless both are zero.
 */
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))

/**
 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
 * @t:	the stacking driver (top)
 * @b:  the underlying device (bottom)
 **/
void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
{
	/* zero is "infinity" */
	t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);

	t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
	t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
	t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
	t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
}

EXPORT_SYMBOL(blk_queue_stack_limits);
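
/*
 * Illustrative sketch, not part of the original file: a stacking driver
 * (md/dm style) would typically fold the limits of each underlying device
 * into its own queue from its device-add path.  The stacked_adopt_limits
 * name is hypothetical.
 */
#if 0
static void stacked_adopt_limits(request_queue_t *top, struct block_device *bdev)
{
	request_queue_t *bottom = bdev_get_queue(bdev);

	if (bottom)
		blk_queue_stack_limits(top, bottom);
}
#endif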

/**
 * blk_queue_segment_boundary - set boundary rules for segment merging
 * @q:  the request queue for the device
 * @mask:  the memory boundary mask
 **/
void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
{
	if (mask < PAGE_CACHE_SIZE - 1) {
		mask = PAGE_CACHE_SIZE - 1;
		printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
	}

	q->seg_boundary_mask = mask;
}

EXPORT_SYMBOL(blk_queue_segment_boundary);

/**
 * blk_queue_dma_alignment - set dma length and memory alignment
 * @q:     the request queue for the device
 * @mask:  alignment mask
 *
 * description:
 *    set required memory and length alignment for direct dma transactions.
 *    this is used when building direct io requests for the queue.
 *
 **/
void blk_queue_dma_alignment(request_queue_t *q, int mask)
{
	q->dma_alignment = mask;
}

EXPORT_SYMBOL(blk_queue_dma_alignment);

/**
 * blk_queue_find_tag - find a request by its tag and queue
 *
 * @q:	 The request queue for the device
 * @tag: The tag of the request
 *
 * Notes:
 *    Should be used when a device returns a tag and you want to match
 *    it with a request.
 *
 *    no locks need be held.
 **/
struct request *blk_queue_find_tag(request_queue_t *q, int tag)
{
	struct blk_queue_tag *bqt = q->queue_tags;

	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
		return NULL;

	return bqt->tag_index[tag];
}

EXPORT_SYMBOL(blk_queue_find_tag);

/**
 * blk_queue_free_tags - release tag maintenance info
 * @q:  the request queue for the device
 *
 *  Notes:
 *    blk_cleanup_queue() will take care of calling this function, if tagging
 *    has been used. So there's usually no need to call this directly, unless
 *    tagging is just being disabled but the queue remains in function.
 **/
void blk_queue_free_tags(request_queue_t *q)
{
	struct blk_queue_tag *bqt = q->queue_tags;

	if (!bqt)
		return;

	if (atomic_dec_and_test(&bqt->refcnt)) {
		BUG_ON(bqt->busy);
		BUG_ON(!list_empty(&bqt->busy_list));

		kfree(bqt->tag_index);
		bqt->tag_index = NULL;

		kfree(bqt->tag_map);
		bqt->tag_map = NULL;

		kfree(bqt);
	}

	q->queue_tags = NULL;
	q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
}

EXPORT_SYMBOL(blk_queue_free_tags);

static int
init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
{
	int bits, i;

	if (depth > q->nr_requests * 2) {
		depth = q->nr_requests * 2;
		printk(KERN_ERR "%s: adjusted depth to %d\n",
				__FUNCTION__, depth);
	}

	tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
	if (!tags->tag_index)
		goto fail;

	bits = (depth / BLK_TAGS_PER_LONG) + 1;
	tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
	if (!tags->tag_map)
		goto fail;

	memset(tags->tag_index, 0, depth * sizeof(struct request *));
	memset(tags->tag_map, 0, bits * sizeof(unsigned long));
	tags->max_depth = depth;
	tags->real_max_depth = bits * BITS_PER_LONG;

	/*
	 * set the upper bits if the depth isn't a multiple of the word size
	 */
	for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
		__set_bit(i, tags->tag_map);

	INIT_LIST_HEAD(&tags->busy_list);
	tags->busy = 0;
	atomic_set(&tags->refcnt, 1);
	return 0;
fail:
	kfree(tags->tag_index);
	return -ENOMEM;
}

/**
 * blk_queue_init_tags - initialize the queue tag info
 * @q:  the request queue for the device
 * @depth:  the maximum queue depth supported
 **/
int blk_queue_init_tags(request_queue_t *q, int depth,
			struct blk_queue_tag *tags)
{
	if (!tags) {
		tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
		if (!tags)
			goto fail;

		if (init_tag_map(q, tags, depth))
			goto fail;
	} else
		atomic_inc(&tags->refcnt);

	/*
	 * assign it, all done
	 */
	q->queue_tags = tags;
	q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
	return 0;
fail:
	kfree(tags);
	return -ENOMEM;
}

EXPORT_SYMBOL(blk_queue_init_tags);
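
/*
 * Illustrative sketch, not part of the original file: a driver enabling
 * tagged queueing for a device that accepts up to 64 outstanding commands
 * (mydev_enable_tcq is a hypothetical name).  Individual requests are then
 * tagged with blk_queue_start_tag() and released with blk_queue_end_tag().
 */
#if 0
static int mydev_enable_tcq(request_queue_t *q)
{
	return blk_queue_init_tags(q, 64, NULL);
}
#endif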

/**
 * blk_queue_resize_tags - change the queueing depth
 * @q:  the request queue for the device
 * @new_depth: the new max command queueing depth
 *
 *  Notes:
 *    Must be called with the queue lock held.
 **/
int blk_queue_resize_tags(request_queue_t *q, int new_depth)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	struct request **tag_index;
	unsigned long *tag_map;
	int bits, max_depth;

	if (!bqt)
		return -ENXIO;

	/*
	 * don't bother sizing down
	 */
	if (new_depth <= bqt->real_max_depth) {
		bqt->max_depth = new_depth;
		return 0;
	}

	/*
	 * save the old state info, so we can copy it back
	 */
	tag_index = bqt->tag_index;
	tag_map = bqt->tag_map;
	max_depth = bqt->real_max_depth;

	if (init_tag_map(q, bqt, new_depth))
		return -ENOMEM;

	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
	bits = max_depth / BLK_TAGS_PER_LONG;
	memcpy(bqt->tag_map, tag_map, bits * sizeof(unsigned long));

	kfree(tag_index);
	kfree(tag_map);
	return 0;
}

/**
 * blk_queue_end_tag - end tag operations for a request
 * @q:  the request queue for the device
 * @rq: the request that has completed
 *
 *  Description:
 *    Typically called when end_that_request_first() returns 0, meaning
 *    all transfers have been done for a request. It's important to call
 *    this function before end_that_request_last(), as that will put the
 *    request back on the free list thus corrupting the internal tag list.
 *
 *  Notes:
 *   queue lock must be held.
 **/
void blk_queue_end_tag(request_queue_t *q, struct request *rq)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	int tag = rq->tag;

	BUG_ON(tag == -1);

	if (unlikely(tag >= bqt->real_max_depth))
		return;

	if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
		printk("attempt to clear non-busy tag (%d)\n", tag);
		return;
	}

	list_del_init(&rq->queuelist);
	rq->flags &= ~REQ_QUEUED;
	rq->tag = -1;

	if (unlikely(bqt->tag_index[tag] == NULL))
		printk("tag %d is missing\n", tag);

	bqt->tag_index[tag] = NULL;
	bqt->busy--;
}

EXPORT_SYMBOL(blk_queue_end_tag);

/**
 * blk_queue_start_tag - find a free tag and assign it
 * @q:  the request queue for the device
 * @rq:  the block request that needs tagging
 *
 *  Description:
 *    This can either be used as a stand-alone helper, or possibly be
 *    assigned as the queue &prep_rq_fn (in which case &struct request
 *    automagically gets a tag assigned). Note that this function
 *    assumes that any type of request can be queued! If this is not
 *    true for your device, you must check the request type before
 *    calling this function.  The request will also be removed from
 *    the request queue, so it's the driver's responsibility to re-add
 *    it if it should need to be restarted for some reason.
 *
 *  Notes:
 *   queue lock must be held.
 **/
int blk_queue_start_tag(request_queue_t *q, struct request *rq)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	unsigned long *map = bqt->tag_map;
	int tag = 0;

	if (unlikely((rq->flags & REQ_QUEUED))) {
		printk(KERN_ERR 
		       "request %p for device [%s] already tagged %d",
		       rq, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
		BUG();
	}

	for (map = bqt->tag_map; *map == -1UL; map++) {
		tag += BLK_TAGS_PER_LONG;

		if (tag >= bqt->max_depth)
			return 1;
	}

	tag += ffz(*map);
	__set_bit(tag, bqt->tag_map);

	rq->flags |= REQ_QUEUED;
	rq->tag = tag;
	bqt->tag_index[tag] = rq;
	blkdev_dequeue_request(rq);
	list_add(&rq->queuelist, &bqt->busy_list);
	bqt->busy++;
	return 0;
}

EXPORT_SYMBOL(blk_queue_start_tag);

/**
 * blk_queue_invalidate_tags - invalidate all pending tags
 * @q:  the request queue for the device
 *
 *  Description:
 *   Hardware conditions may dictate a need to stop all pending requests.
 *   In this case, we will safely clear the block side of the tag queue and
 *   readd all requests to the request queue in the right order.
 *
 *  Notes:
 *   queue lock must be held.
 **/
void blk_queue_invalidate_tags(request_queue_t *q)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	struct list_head *tmp, *n;
	struct request *rq;

	list_for_each_safe(tmp, n, &bqt->busy_list) {
		rq = list_entry_rq(tmp);

		if (rq->tag == -1) {
			printk("bad tag found on list\n");
			list_del_init(&rq->queuelist);
			rq->flags &= ~REQ_QUEUED;
		} else
			blk_queue_end_tag(q, rq);

		rq->flags &= ~REQ_STARTED;
		__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
	}
}

EXPORT_SYMBOL(blk_queue_invalidate_tags);

static char *rq_flags[] = {
	"REQ_RW",
	"REQ_FAILFAST",
	"REQ_SOFTBARRIER",
	"REQ_HARDBARRIER",
	"REQ_CMD",
	"REQ_NOMERGE",
	"REQ_STARTED",
	"REQ_DONTPREP",
	"REQ_QUEUED",
	"REQ_PC",
	"REQ_BLOCK_PC",
	"REQ_SENSE",
	"REQ_FAILED",
	"REQ_QUIET",
	"REQ_SPECIAL",
	"REQ_DRIVE_CMD",
	"REQ_DRIVE_TASK",
	"REQ_DRIVE_TASKFILE",
	"REQ_PREEMPT",
	"REQ_PM_SUSPEND",
	"REQ_PM_RESUME",
	"REQ_PM_SHUTDOWN",
};

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk("%s: dev %s: flags = ", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?");
	bit = 0;
	do {
		if (rq->flags & (1 << bit))
			printk("%s ", rq_flags[bit]);
		bit++;
	} while (bit < __REQ_NR_BITS);

	printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
						       rq->nr_sectors,
						       rq->current_nr_sectors);
815 816 817 818 819 820 821 822
	printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);

	if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
		printk("cdb: ");
		for (bit = 0; bit < sizeof(rq->cmd); bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}

EXPORT_SYMBOL(blk_dump_rq_flags);

void blk_recount_segments(request_queue_t *q, struct bio *bio)
{
	struct bio_vec *bv, *bvprv = NULL;
	int i, nr_phys_segs, nr_hw_segs, seg_size, cluster;
	int high, highprv = 1;

	if (unlikely(!bio->bi_io_vec))
		return;

	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
	seg_size = nr_phys_segs = nr_hw_segs = 0;
	bio_for_each_segment(bv, bio, i) {
		/*
		 * the trick here is making sure that a high page is never
		 * considered part of another segment, since that might
		 * change with the bounce page.
		 */
		high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
		if (high || highprv)
			goto new_hw_segment;
		if (cluster) {
			if (seg_size + bv->bv_len > q->max_segment_size)
				goto new_segment;
			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
				goto new_segment;
			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
				goto new_segment;

			seg_size += bv->bv_len;
			bvprv = bv;
			continue;
		}
new_segment:
		if (!BIOVEC_VIRT_MERGEABLE(bvprv, bv))
new_hw_segment:
			nr_hw_segs++;

		nr_phys_segs++;
		bvprv = bv;
		seg_size = bv->bv_len;
		highprv = high;
	}

	bio->bi_phys_segments = nr_phys_segs;
	bio->bi_hw_segments = nr_hw_segs;
	bio->bi_flags |= (1 << BIO_SEG_VALID);
}


int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
				   struct bio *nxt)
{
	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
		return 0;

	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
		return 0;
	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
		return 0;

	/*
	 * bio and nxt are contiguous in memory, check if the queue allows
	 * these two to be merged into one
	 */
	if (BIO_SEG_BOUNDARY(q, bio, nxt))
		return 1;

	return 0;
}

EXPORT_SYMBOL(blk_phys_contig_segment);

int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
				 struct bio *nxt)
{
	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
		return 0;

	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
		return 0;
	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
		return 0;

	/*
	 * bio and nxt are contiguous in memory, check if the queue allows
	 * these two to be merged into one
	 */
	if (BIO_SEG_BOUNDARY(q, bio, nxt))
		return 1;

	return 0;
}

EXPORT_SYMBOL(blk_hw_contig_segment);

/*
 * map a request to scatterlist, return number of sg entries setup. Caller
 * must make sure sg can hold rq->nr_phys_segments entries
 */
int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
{
	struct bio_vec *bvec, *bvprv;
	struct bio *bio;
	int nsegs, i, cluster;

	nsegs = 0;
	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);

	/*
	 * for each bio in rq
	 */
	bvprv = NULL;
	rq_for_each_bio(bio, rq) {
		/*
		 * for each segment in bio
		 */
		bio_for_each_segment(bvec, bio, i) {
			int nbytes = bvec->bv_len;

			if (bvprv && cluster) {
				if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
					goto new_segment;

				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
					goto new_segment;
				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
					goto new_segment;
Linus Torvalds's avatar
				sg[nsegs - 1].length += nbytes;
			} else {
new_segment:
				memset(&sg[nsegs],0,sizeof(struct scatterlist));
				sg[nsegs].page = bvec->bv_page;
				sg[nsegs].length = nbytes;
				sg[nsegs].offset = bvec->bv_offset;

				nsegs++;
			}
			bvprv = bvec;
		} /* segments in bio */
	} /* bios in rq */

	return nsegs;
}

EXPORT_SYMBOL(blk_rq_map_sg);
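
/*
 * Illustrative sketch, not part of the original file: a request_fn
 * building a scatterlist for its controller with blk_rq_map_sg().
 * MYDEV_MAX_SG and mydev_start_io are hypothetical; a real driver sizes
 * the table to what its hardware can take.
 */
#if 0
#define MYDEV_MAX_SG	128

static void mydev_start_io(request_queue_t *q, struct request *rq)
{
	struct scatterlist sg[MYDEV_MAX_SG];
	int nsegs;

	BUG_ON(rq->nr_phys_segments > MYDEV_MAX_SG);
	nsegs = blk_rq_map_sg(q, rq, sg);
	/* hand the first 'nsegs' entries of sg[] to the hardware here */
}
#endif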

/*
 * the standard queue merge functions, can be overridden with device
 * specific ones if so desired
 */

static inline int ll_new_mergeable(request_queue_t *q,
				   struct request *req,
				   struct bio *bio)
{
	int nr_phys_segs = bio_phys_segments(q, bio);

	if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
		req->flags |= REQ_NOMERGE;
		q->last_merge = NULL;
		return 0;
	}

	/*
	 * A hw segment is just getting larger, bump just the phys
	 * counter.
	 */
	req->nr_phys_segments += nr_phys_segs;
	return 1;
}

static inline int ll_new_hw_segment(request_queue_t *q,
				    struct request *req,
				    struct bio *bio)
{
	int nr_hw_segs = bio_hw_segments(q, bio);
	int nr_phys_segs = bio_phys_segments(q, bio);

	if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
		req->flags |= REQ_NOMERGE;
		q->last_merge = NULL;
		return 0;
	}

	/*
	 * This will form the start of a new hw segment.  Bump both
	 * counters.
	 */
	req->nr_hw_segments += nr_hw_segs;
	req->nr_phys_segments += nr_phys_segs;
	return 1;
}

static int ll_back_merge_fn(request_queue_t *q, struct request *req, 
			    struct bio *bio)
{
	if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
		req->flags |= REQ_NOMERGE;
		q->last_merge = NULL;
		return 0;
	}

	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)))
		return ll_new_mergeable(q, req, bio);

	return ll_new_hw_segment(q, req, bio);
}

static int ll_front_merge_fn(request_queue_t *q, struct request *req, 
			     struct bio *bio)
{
	if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
		req->flags |= REQ_NOMERGE;
		q->last_merge = NULL;
		return 0;
	}

	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)))
		return ll_new_mergeable(q, req, bio);

	return ll_new_hw_segment(q, req, bio);
}

static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
				struct request *next)
{
	int total_phys_segments = req->nr_phys_segments +next->nr_phys_segments;
	int total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;

	/*
	 * First check if either of the requests are re-queued
	 * requests.  Can't merge them if they are.
	 */
	if (req->special || next->special)
		return 0;

	/*
	 * Will it become too large?
	 */
	if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
		return 0;

	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
	if (blk_phys_contig_segment(q, req->biotail, next->bio))
		total_phys_segments--;

	if (total_phys_segments > q->max_phys_segments)
		return 0;

	total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
	if (blk_hw_contig_segment(q, req->biotail, next->bio))
		total_hw_segments--;
1081

Linus Torvalds's avatar
Linus Torvalds's avatar

Linus Torvalds's avatar
	req->nr_phys_segments = total_phys_segments;
	req->nr_hw_segments = total_hw_segments;
Linus Torvalds's avatar
}

/*
 * "plug" the device if there are no outstanding requests: this will
 * force the transfer to start only after we have put all the requests
 * on the list.
 *
Jens Axboe's avatar
 * with the queue lock held.
Linus Torvalds's avatar
Linus Torvalds's avatar
Linus Torvalds's avatar
1101
	WARN_ON(!irqs_disabled());
Jens Axboe's avatar
	/*
	 * don't plug a stopped queue, it must be paired with blk_start_queue()
	 * which will restart the queueing
	 */
	if (!blk_queue_plugged(q)
	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
1109 1110
		spin_lock(&blk_plug_lock);
		list_add_tail(&q->plug_list, &blk_plug_list);
1111
		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1112 1113
		spin_unlock(&blk_plug_lock);
	}
Linus Torvalds's avatar

1116 1117
EXPORT_SYMBOL(blk_plug_device);

Jens Axboe's avatar
 * remove the queue from the plugged list, if present. called with
 * queue lock held and interrupts disabled.
 */
1122
int blk_remove_plug(request_queue_t *q)
Jens Axboe's avatar
1124
	WARN_ON(!irqs_disabled());
Jens Axboe's avatar
		spin_lock(&blk_plug_lock);
		list_del_init(&q->plug_list);
1128
		del_timer(&q->unplug_timer);
Jens Axboe's avatar
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL(blk_remove_plug);

/*
 * remove the plug and let it rip..
 */
static inline void __generic_unplug_device(request_queue_t *q)
{
	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
		return;

	if (!blk_remove_plug(q))
		return;

	/*
	 * was plugged, fire request_fn if queue has stuff to do
	 */
	if (elv_next_request(q))
		q->request_fn(q);
}

/**
 * generic_unplug_device - fire a request queue
 * @data:    The &request_queue_t in question
 *
 * Description:
 *   Linux uses plugging to build bigger requests queues before letting
 *   the device have at them. If a queue is plugged, the I/O scheduler
 *   is still adding and merging requests on the queue. Once the queue
 *   gets unplugged (either by manually calling this function, or by
1165
 *   calling blk_run_queues()), the request_fn defined for the
Linus Torvalds's avatar
 **/
void generic_unplug_device(void *data)
{
	request_queue_t *q = data;

	spin_lock_irq(q->queue_lock);
	__generic_unplug_device(q);
	spin_unlock_irq(q->queue_lock);
}

EXPORT_SYMBOL(generic_unplug_device);

static void blk_unplug_work(void *data)
{
	request_queue_t *q = data;
	q->unplug_fn(q);
}

static void blk_unplug_timeout(unsigned long data)
{
	request_queue_t *q = (request_queue_t *)data;

	kblockd_schedule_work(&q->unplug_work);
}

/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &request_queue_t in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
 *   entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(request_queue_t *q)
{
	clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);

	/*
	 * one level of recursion is ok and is much faster than kicking
	 * the unplug handling
	 */
	if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
		q->request_fn(q);
		clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
	} else {
		blk_plug_device(q);
		schedule_work(&q->unplug_work);
	}
}

EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &request_queue_t in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(request_queue_t *q)
{
	blk_remove_plug(q);
	set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
}

EXPORT_SYMBOL(blk_stop_queue);

/**
 * blk_run_queue - run a single device queue
 * @q:	The queue to run
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_remove_plug(q);
	q->request_fn(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

EXPORT_SYMBOL(blk_run_queue);

/**
 * blk_run_queues - fire all plugged queues
 *
 * Description:
 *   Start I/O on all plugged queues known to the block layer. Queues that
 *   are currently stopped are ignored. This is equivalent to the older
 *   tq_disk task queue run.
 **/
#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
void blk_run_queues(void)
{
	LIST_HEAD(local_plug_list);

	spin_lock_irq(&blk_plug_lock);

	/*
	 * this will happen fairly often
	 */
	if (list_empty(&blk_plug_list))
		goto out;

	list_splice_init(&blk_plug_list, &local_plug_list);
	
	while (!list_empty(&local_plug_list)) {
		request_queue_t *q = blk_plug_entry(local_plug_list.next);

		spin_unlock_irq(&blk_plug_lock);
		q->unplug_fn(q);
		spin_lock_irq(&blk_plug_lock);
	}
out:
	spin_unlock_irq(&blk_plug_lock);
}

EXPORT_SYMBOL(blk_run_queues);

/**
 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
 * @q:    the request queue to be released
 *
 * Description:
 *     blk_cleanup_queue is the pair to blk_init_queue() or
 *     blk_queue_make_request().  It should be called when a request queue is
 *     being released; typically when a block device is being de-registered.
 *     Currently, its primary task is to free all the &struct request
 *     structures that were allocated to the queue and the queue itself.
 *
 * Caveat:
 *     Hopefully the low level driver will have finished any
 *     outstanding requests first...
 **/
void blk_cleanup_queue(request_queue_t * q)
{
	struct request_list *rl = &q->rq;

	if (!atomic_dec_and_test(&q->refcnt))
		return;

	elevator_exit(q);

	del_timer_sync(&q->unplug_timer);
	kblockd_flush();

	if (rl->rq_pool)
		mempool_destroy(rl->rq_pool);

	if (blk_queue_tagged(q))
		blk_queue_free_tags(q);

	kfree(q);
}

EXPORT_SYMBOL(blk_cleanup_queue);

static int blk_init_free_list(request_queue_t *q)
{
	struct request_list *rl = &q->rq;

	rl->count[READ] = rl->count[WRITE] = 0;
	init_waitqueue_head(&rl->wait[READ]);
	init_waitqueue_head(&rl->wait[WRITE]);

	rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep);

	if (!rl->rq_pool)
		return -ENOMEM;

	return 0;
}

static int __make_request(request_queue_t *, struct bio *);

static elevator_t *chosen_elevator =
#if defined(CONFIG_IOSCHED_AS)
	&iosched_as;
#elif defined(CONFIG_IOSCHED_DEADLINE)
	&iosched_deadline;
1355
#elif defined(CONFIG_IOSCHED_NOOP)
1356
	&elevator_noop;
1357 1358 1359
#else
	NULL;
#error "You must have at least 1 I/O scheduler selected"
1360 1361
#endif

1362
#if defined(CONFIG_IOSCHED_AS) || defined(CONFIG_IOSCHED_DEADLINE) || defined (CONFIG_IOSCHED_NOOP)
1363 1364
static int __init elevator_setup(char *str)
{
1365
#ifdef CONFIG_IOSCHED_DEADLINE
1366 1367
	if (!strcmp(str, "deadline"))
		chosen_elevator = &iosched_deadline;
1368 1369
#endif
#ifdef CONFIG_IOSCHED_AS
1370 1371
	if (!strcmp(str, "as"))
		chosen_elevator = &iosched_as;
1372 1373 1374 1375
#endif
#ifdef CONFIG_IOSCHED_NOOP
	if (!strcmp(str, "noop"))
		chosen_elevator = &elevator_noop;
1376
#endif
1377 1378
	return 1;
}
1379

1380
__setup("elevator=", elevator_setup);
1381
#endif /* CONFIG_IOSCHED_AS || CONFIG_IOSCHED_DEADLINE || CONFIG_IOSCHED_NOOP */
1382

1383 1384 1385 1386 1387 1388 1389 1390
request_queue_t *blk_alloc_queue(int gfp_mask)
{
	request_queue_t *q = kmalloc(sizeof(*q), gfp_mask);

	if (!q)
		return NULL;

	memset(q, 0, sizeof(*q));
1391
	init_timer(&q->unplug_timer);
1392 1393 1394 1395
	atomic_set(&q->refcnt, 1);
	return q;
}

1396 1397
EXPORT_SYMBOL(blk_alloc_queue);

Linus Torvalds's avatar
 * blk_init_queue  - prepare a request queue for use with a block device
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
1402
 * @lock: Request queue spin lock
Linus Torvalds's avatar
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
Linus Torvalds's avatar
 *    request queue.
Linus Torvalds's avatar
Linus Torvalds committed
1421
 *
1422 1423 1424
 *    Function returns a pointer to the initialized request queue, or NULL if
 *    it didn't succeed.
 *
Linus Torvalds's avatar
Linus Torvalds committed
1425
 * Note:
Linus Torvalds's avatar
Linus Torvalds committed
1426
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
Linus Torvalds's avatar
Linus Torvalds committed
1427 1428
 *    when the block device is deactivated (such as at module unload).
 **/
1429
request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
Linus Torvalds's avatar
Linus Torvalds committed
1430
{
1431
	request_queue_t *q;
1432
	static int printed;
Linus Torvalds's avatar
Linus Torvalds committed
1433

1434 1435 1436 1437
	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return NULL;

Linus Torvalds's avatar
Linus Torvalds committed
1438
	if (blk_init_free_list(q))
1439
		goto out_init;
Linus Torvalds's avatar
Linus Torvalds committed
1440

1441 1442
	if (!printed) {
		printed = 1;
1443
		printk("Using %s io scheduler\n", chosen_elevator->elevator_name);
1444 1445
	}

1446 1447
	if (elevator_init(q, chosen_elevator))
		goto out_elv;
Linus Torvalds's avatar
Linus Torvalds committed
1448

Linus Torvalds's avatar
Linus Torvalds committed
1449
	q->request_fn		= rfn;
Linus Torvalds's avatar
Linus Torvalds committed
1450 1451 1452
	q->back_merge_fn       	= ll_back_merge_fn;
	q->front_merge_fn      	= ll_front_merge_fn;
	q->merge_requests_fn	= ll_merge_requests_fn;
Linus Torvalds's avatar
Linus Torvalds committed
1453
	q->prep_rq_fn		= NULL;
Jens Axboe's avatar
Jens Axboe committed
1454
	q->unplug_fn		= generic_unplug_device;
Linus Torvalds's avatar
Linus Torvalds committed
1455
	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER);
Linus Torvalds's avatar
Linus Torvalds committed
1456
	q->queue_lock		= lock;
Andrew Morton's avatar
Andrew Morton committed
1457

Linus Torvalds's avatar
Linus Torvalds committed
1458 1459
	blk_queue_segment_boundary(q, 0xffffffff);

Linus Torvalds's avatar
Linus Torvalds committed
1460 1461
	blk_queue_make_request(q, __make_request);
	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
Linus Torvalds's avatar
Linus Torvalds committed
1462 1463 1464

	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
Jens Axboe's avatar
Jens Axboe committed
1465

1466 1467 1468 1469 1470 1471 1472 1473
	return q;
out_elv:
	blk_cleanup_queue(q);
out_init:
	kfree(q);
	return NULL;
}

EXPORT_SYMBOL(blk_init_queue);
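
/*
 * Usage sketch (illustrative only): a minimal request_fn-based driver pairs
 * blk_init_queue() with a handler that services one chunk at a time via
 * end_request().  "mydev_lock", "mydev_transfer" and the other mydev_*
 * names are hypothetical.
 */
#if 0
static spinlock_t mydev_lock = SPIN_LOCK_UNLOCKED;
static request_queue_t *mydev_queue;

/* hypothetical helper that moves data to/from the hardware */
static void mydev_transfer(sector_t sector, unsigned long nsect,
			   char *buffer, int write);

static void mydev_request(request_queue_t *q)
{
	struct request *rq;

	/* elv_next_request() keeps returning rq until it is fully ended */
	while ((rq = elv_next_request(q)) != NULL) {
		if (!blk_fs_request(rq)) {
			end_request(rq, 0);
			continue;
		}
		mydev_transfer(rq->sector, rq->current_nr_sectors,
			       rq->buffer, rq_data_dir(rq) == WRITE);
		end_request(rq, 1);
	}
}

static int __init mydev_init(void)
{
	mydev_queue = blk_init_queue(mydev_request, &mydev_lock);
	return mydev_queue ? 0 : -ENOMEM;
}
#endif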

int blk_get_queue(request_queue_t *q)
{
	if (!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
		atomic_inc(&q->refcnt);
		return 0;
	}

	return 1;
}

EXPORT_SYMBOL(blk_get_queue);

static inline void blk_free_request(request_queue_t *q, struct request *rq)
{
	elv_put_request(q, rq);
	mempool_free(rq, q->rq.rq_pool);
}

static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask)
{
	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

	if (!rq)
		return NULL;

	if (!elv_set_request(q, rq, gfp_mask))
		return rq;

	mempool_free(rq, q->rq.rq_pool);
	return NULL;
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct io_context *ioc)
{
	if (!ioc)
		return 0;

1517 1518 1519 1520 1521
	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
1522 1523 1524 1525 1526 1527
	return ioc->nr_batch_requests == BLK_BATCH_REQ ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
1528 1529 1530 1531
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
void ioc_set_batching(struct io_context *ioc)
{
	if (!ioc || ioc_batching(ioc))
		return;

	ioc->nr_batch_requests = BLK_BATCH_REQ;
	ioc->last_waited = jiffies;
}

/*
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
static void freed_request(request_queue_t *q, int rw)
{
	struct request_list *rl = &q->rq;

	rl->count[rw]--;
	if (rl->count[rw] < queue_congestion_off_threshold(q))
		clear_queue_congested(q, rw);
	if (rl->count[rw]+1 <= q->nr_requests) {
		if (waitqueue_active(&rl->wait[rw]))
			wake_up(&rl->wait[rw]);
		if (!waitqueue_active(&rl->wait[rw]))
			blk_clear_queue_full(q, rw);
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
1561
#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
Linus Torvalds's avatar
Linus Torvalds committed
1562
/*
1563
 * Get a free request, queue_lock must not be held
Linus Torvalds's avatar
Linus Torvalds committed
1564
 */
1565
static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
1566
{
Linus Torvalds's avatar
Linus Torvalds committed
1567
	struct request *rq = NULL;
1568
	struct request_list *rl = &q->rq;
Andrew Morton's avatar
Andrew Morton committed
1569
	struct io_context *ioc = get_io_context(gfp_mask);
1570 1571

	spin_lock_irq(q->queue_lock);
1572
	if (rl->count[rw]+1 >= q->nr_requests) {
1573 1574 1575 1576 1577 1578
		/*
		 * The queue will fill after this allocation, so set it as
		 * full, and mark this process as "batching". This process
		 * will be allowed to complete a batch of requests, others
		 * will be blocked.
		 */
1579 1580 1581 1582 1583
		if (!blk_queue_full(q, rw)) {
			ioc_set_batching(ioc);
			blk_set_queue_full(q, rw);
		}
	}
Andrew Morton's avatar
Andrew Morton committed
1584

1585 1586
	if (blk_queue_full(q, rw)
			&& !ioc_batching(ioc) && !elv_may_queue(q, rw)) {
1587 1588 1589 1590
		/*
		 * The queue is full and the allocating process is not a
		 * "batcher", and not exempted by the IO scheduler
		 */
1591 1592
		spin_unlock_irq(q->queue_lock);
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
1593
	}
1594

1595
	rl->count[rw]++;
1596
	if (rl->count[rw] >= queue_congestion_on_threshold(q))
1597 1598
		set_queue_congested(q, rw);
	spin_unlock_irq(q->queue_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1599

1600 1601
	rq = blk_alloc_request(q, gfp_mask);
	if (!rq) {
1602 1603 1604 1605 1606 1607 1608
		/*
		 * Allocation failed presumably due to memory. Undo anything
		 * we might have messed up.
		 *
		 * Allocating task should really be put onto the front of the
		 * wait queue, but this is pretty rare.
		 */
1609
		spin_lock_irq(q->queue_lock);
1610
		freed_request(q, rw);
1611 1612 1613
		spin_unlock_irq(q->queue_lock);
		goto out;
	}
1614 1615 1616

	if (ioc_batching(ioc))
		ioc->nr_batch_requests--;
	
	INIT_LIST_HEAD(&rq->queuelist);

	/*
	 * first three bits are identical in rq->flags and bio->bi_rw,
	 * see bio.h and blkdev.h
	 */
	rq->flags = rw;

	rq->errors = 0;
	rq->rq_status = RQ_ACTIVE;
	rq->bio = rq->biotail = NULL;
	rq->buffer = NULL;
	rq->ref_count = 1;
	rq->q = q;
	rq->rl = rl;
	rq->waiting = NULL;
	rq->special = NULL;
1635
	rq->data_len = 0;
1636 1637 1638 1639
	rq->data = NULL;
	rq->sense = NULL;

out:
1640
	put_io_context(ioc);
Linus Torvalds's avatar
Linus Torvalds committed
1641 1642 1643 1644
	return rq;
}

/*
1645 1646
 * No available requests for this queue, unplug the device and wait for some
 * requests to become available.
Linus Torvalds's avatar
Linus Torvalds committed
1647
 */
Linus Torvalds's avatar
Linus Torvalds committed
1648
static struct request *get_request_wait(request_queue_t *q, int rw)
Linus Torvalds's avatar
Linus Torvalds committed
1649
{
1650
	DEFINE_WAIT(wait);
Linus Torvalds's avatar
Linus Torvalds committed
1651 1652
	struct request *rq;

Linus Torvalds's avatar
Linus Torvalds committed
1653 1654
	generic_unplug_device(q);
	do {
1655
		struct request_list *rl = &q->rq;
1656

1657 1658
		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
				TASK_UNINTERRUPTIBLE);
1659

1660
		rq = get_request(q, rw, GFP_NOIO);
1661 1662

		if (!rq) {
1663 1664
			struct io_context *ioc;

1665
			io_schedule();
1666 1667 1668 1669 1670 1671 1672

			/*
			 * After sleeping, we become a "batching" process and
			 * will be able to allocate at least one request, and
			 * up to a big batch of them for a small period time.
			 * See ioc_batching, ioc_set_batching
			 */
Andrew Morton's avatar
Andrew Morton committed
1673
			ioc = get_io_context(GFP_NOIO);
1674 1675
			ioc_set_batching(ioc);
			put_io_context(ioc);
1676
		}
1677
		finish_wait(&rl->wait[rw], &wait);
1678
	} while (!rq);
1679

Linus Torvalds's avatar
Linus Torvalds committed
1680 1681 1682
	return rq;
}

Linus Torvalds's avatar
Linus Torvalds committed
1683 1684 1685 1686 1687 1688
struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
{
	struct request *rq;

	BUG_ON(rw != READ && rw != WRITE);

1689
	if (gfp_mask & __GFP_WAIT)
Linus Torvalds's avatar
Linus Torvalds committed
1690
		rq = get_request_wait(q, rw);
1691
	else
1692
		rq = get_request(q, rw, gfp_mask);
Linus Torvalds's avatar
Linus Torvalds committed
1693

Martin Dalecki's avatar
Martin Dalecki committed
1694 1695
	return rq;
}
1696 1697 1698

EXPORT_SYMBOL(blk_get_request);

1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715
/**
 * blk_requeue_request - put a request back on queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 *
 * Description:
 *    Drivers often keep queueing requests until the hardware cannot accept
 *    more, when that condition happens we need to put the request back
 *    on the queue. Must be called with queue lock held.
 */
void blk_requeue_request(request_queue_t *q, struct request *rq)
{
	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	elv_requeue_request(q, rq);
}
Martin Dalecki's avatar
Martin Dalecki committed
1716

1717 1718
EXPORT_SYMBOL(blk_requeue_request);
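
/*
 * Usage sketch (illustrative only): a driver that dequeued a request but
 * found the device unable to take it can hand it back for a later retry.
 * The lock requirement above is met by taking q->queue_lock explicitly;
 * "mydev_retry_later" is a hypothetical helper.
 */
#if 0
static void mydev_retry_later(request_queue_t *q, struct request *rq)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_requeue_request(q, rq);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
#endif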

/**
 * blk_insert_request - insert a special request into a request queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 * @at_head:	insert request at head or tail of queue
 * @data:	private data
 * @reinsert:	true if request is a reinsertion of a previously processed one
 *
 * Description:
 *    Many block devices need to execute commands asynchronously, so they don't
 *    block the whole kernel from preemption during request execution.  This is
 *    accomplished normally by inserting artificial requests tagged as
 *    REQ_SPECIAL into the corresponding request queue, and letting them be
 *    scheduled for actual execution by the request queue.
 *
 *    We have the option of inserting the head or the tail of the queue.
 *    Typically we use the tail for new ioctls and so forth.  We use the head
 *    of the queue for things like a QUEUE_FULL message from a device, or a
 *    host that is unable to accept a particular command.
 */
void blk_insert_request(request_queue_t *q, struct request *rq,
1740
			int at_head, void *data, int reinsert)
Linus Torvalds's avatar
Linus Torvalds committed
1741
{
1742 1743 1744 1745 1746 1747 1748
	unsigned long flags;

	/*
	 * tell I/O scheduler that this isn't a regular read/write (ie it
	 * must not attempt merges on this) and that it acts as a soft
	 * barrier
	 */
Jens Axboe's avatar
Jens Axboe committed
1749
	rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER;
1750 1751 1752 1753

	rq->special = data;

	spin_lock_irqsave(q->queue_lock, flags);
1754 1755 1756 1757

	/*
	 * If command is tagged, release the tag
	 */
1758
	if (reinsert)
1759
		blk_requeue_request(q, rq);
1760
	else {
Jens Axboe's avatar
Jens Axboe committed
1761 1762 1763 1764 1765
		int where = ELEVATOR_INSERT_BACK;

		if (at_head)
			where = ELEVATOR_INSERT_FRONT;

1766 1767
		if (blk_rq_tagged(rq))
			blk_queue_end_tag(q, rq);
1768

1769
		drive_stat_acct(rq, rq->nr_sectors, 1);
Jens Axboe's avatar
Jens Axboe committed
1770
		__elv_add_request(q, rq, where, 0);
1771
	}
1772 1773 1774 1775
	if (blk_queue_plugged(q))
		__generic_unplug_device(q);
	else
		q->request_fn(q);
1776
	spin_unlock_irqrestore(q->queue_lock, flags);
Linus Torvalds's avatar
Linus Torvalds committed
1777 1778
}

1779 1780
EXPORT_SYMBOL(blk_insert_request);
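
/*
 * Usage sketch (illustrative only): allocate a request, hand the private
 * command to the driver through @data and queue it at the tail.  The
 * command pointer and "mydev_queue_special" are hypothetical.
 */
#if 0
static int mydev_queue_special(request_queue_t *q, void *cmd)
{
	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);

	if (!rq)
		return -ENOMEM;

	/* not a reinsertion, tail insert; cmd shows up as rq->special */
	blk_insert_request(q, rq, 0, cmd, 0);
	return 0;
}
#endif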

/**
 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
 * @q:		request queue where request should be inserted
 * @rw:		READ or WRITE data
 * @ubuf:	the user buffer
 * @len:	length of user data
 *
 * Description:
 *    Data will be mapped directly for zero-copy I/O, if possible. Otherwise
 *    a kernel bounce buffer is used.
 *
 *    A matching blk_rq_unmap_user() must be issued at the end of io, while
 *    still in process context.
 */
struct request *blk_rq_map_user(request_queue_t *q, int rw, void __user *ubuf,
				unsigned int len)
{
	struct request *rq = NULL;
	char *buf = NULL;
	struct bio *bio;
	int ret;

	rq = blk_get_request(q, rw, __GFP_WAIT);
	if (!rq)
		return ERR_PTR(-ENOMEM);

	bio = bio_map_user(q, NULL, (unsigned long) ubuf, len, rw == READ);
	if (!bio) {
		int bytes = (len + 511) & ~511;

		buf = kmalloc(bytes, q->bounce_gfp | GFP_USER);
		if (!buf) {
			ret = -ENOMEM;
			goto fault;
		}

		if (rw == WRITE) {
			if (copy_from_user(buf, ubuf, len)) {
				ret = -EFAULT;
				goto fault;
			}
		} else
			memset(buf, 0, len);
	}

	rq->bio = rq->biotail = bio;
	if (rq->bio)
		blk_rq_bio_prep(q, rq, bio);

	rq->buffer = rq->data = buf;
	rq->data_len = len;
	return rq;
fault:
	if (buf)
		kfree(buf);
	if (bio)
		bio_unmap_user(bio, 1);
	if (rq)
		blk_put_request(rq);

	return ERR_PTR(ret);
}

EXPORT_SYMBOL(blk_rq_map_user);

/**
 * blk_rq_unmap_user - unmap a request with user data
 * @rq:		request to be unmapped
 * @ubuf:	user buffer
 * @ulen:	length of user buffer
 *
 * Description:
 *    Unmap a request previously mapped by blk_rq_map_user().
 */
int blk_rq_unmap_user(struct request *rq, void __user *ubuf, unsigned int ulen)
{
	const int read = rq_data_dir(rq) == READ;
	int ret = 0;

	if (rq->biotail)
		bio_unmap_user(rq->biotail, read);
	if (rq->buffer) {
		if (read && copy_to_user(ubuf, rq->buffer, ulen))
			ret = -EFAULT;
		kfree(rq->buffer);
	}

	blk_put_request(rq);
	return ret;
}

EXPORT_SYMBOL(blk_rq_unmap_user);

/**
 * blk_execute_rq - insert a request into queue for execution
 * @q:		queue to insert the request in
 * @bd_disk:	matching gendisk
 * @rq:		request to insert
 *
 * Description:
 *    Insert a fully prepared request at the back of the io scheduler queue
 *    for execution.
 */
int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
		   struct request *rq)
{
	DECLARE_COMPLETION(wait);
	char sense[SCSI_SENSE_BUFFERSIZE];
	int err = 0;

	rq->rq_disk = bd_disk;

	/*
	 * we need an extra reference to the request, so we can look at
	 * it after io completion
	 */
	rq->ref_count++;

	if (!rq->sense) {
		memset(sense, 0, sizeof(sense));
		rq->sense = sense;
		rq->sense_len = 0;
	}

	rq->flags |= REQ_NOMERGE;
	rq->waiting = &wait;
	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
	generic_unplug_device(q);
	wait_for_completion(&wait);

	if (rq->errors)
		err = -EIO;

	return err;
}

EXPORT_SYMBOL(blk_execute_rq);
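
/*
 * Usage sketch (illustrative only): the three helpers above combine into a
 * simple synchronous passthrough -- map the user buffer, execute, unmap.
 * Filling in the actual command bytes is device-specific and omitted;
 * "mydev_user_read" is a hypothetical wrapper.
 */
#if 0
static int mydev_user_read(request_queue_t *q, struct gendisk *bd_disk,
			   void __user *ubuf, unsigned int len)
{
	struct request *rq;
	int err, uerr;

	rq = blk_rq_map_user(q, READ, ubuf, len);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* rq->cmd and rq->flags would be set up here for a real device */
	err = blk_execute_rq(q, bd_disk, rq);

	/* copies bounce data back to ubuf (for reads) and drops the request */
	uerr = blk_rq_unmap_user(rq, ubuf, len);

	return err ? err : uerr;
}
#endif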

void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
Linus Torvalds's avatar
Linus Torvalds committed
1920
{
Linus Torvalds's avatar
Linus Torvalds committed
1921
	int rw = rq_data_dir(rq);
Linus Torvalds's avatar
Linus Torvalds committed
1922

Jens Axboe's avatar
Jens Axboe committed
1923
	if (!blk_fs_request(rq) || !rq->rq_disk)
1924 1925 1926
		return;

	if (rw == READ) {
1927
		disk_stat_add(rq->rq_disk, read_sectors, nr_sectors);
1928
		if (!new_io)
1929
			disk_stat_inc(rq->rq_disk, read_merges);
1930
	} else if (rw == WRITE) {
1931
		disk_stat_add(rq->rq_disk, write_sectors, nr_sectors);
1932
		if (!new_io)
1933
			disk_stat_inc(rq->rq_disk, write_merges);
1934 1935 1936
	}
	if (new_io) {
		disk_round_stats(rq->rq_disk);
1937
		rq->rq_disk->in_flight++;
1938
	}
Linus Torvalds's avatar
Linus Torvalds committed
1939 1940 1941 1942
}

/*
 * add-request adds a request to the linked list.
Linus Torvalds's avatar
Linus Torvalds committed
1943
 * queue lock is held and interrupts disabled, as we muck with the
Linus Torvalds's avatar
Linus Torvalds committed
1944
 * request queue list.
Linus Torvalds's avatar
Linus Torvalds committed
1945
 */
Jens Axboe's avatar
Jens Axboe committed
1946
static inline void add_request(request_queue_t * q, struct request * req)
Linus Torvalds's avatar
Linus Torvalds committed
1947
{
Linus Torvalds's avatar
Linus Torvalds committed
1948
	drive_stat_acct(req, req->nr_sectors, 1);
Linus Torvalds's avatar
Linus Torvalds committed
1949

Jens Axboe's avatar
Jens Axboe committed
1950 1951 1952
	if (q->activity_fn)
		q->activity_fn(q->activity_data, rq_data_dir(req));

Linus Torvalds's avatar
Linus Torvalds committed
1953
	/*
Linus Torvalds's avatar
Linus Torvalds committed
1954 1955
	 * elevator indicated where it wants this request to be
	 * inserted at elevator_merge time
Linus Torvalds's avatar
Linus Torvalds committed
1956
	 */
Jens Axboe's avatar
Jens Axboe committed
1957
	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1958
}
 
/*
 * disk_round_stats()	- Round off the performance stats on a struct
 * disk_stats.
 *
 * The average IO queue length and utilisation statistics are maintained
 * by observing the current state of the queue length and the amount of
 * time it has been in this state for.
 *
 * Normally, that accounting is done on IO completion, but that can result
 * in more than a second's worth of IO being accounted for within any one
 * second, leading to >100% utilisation.  To deal with that, we call this
 * function to do a round-off before returning the results when reading
 * /proc/diskstats.  This accounts immediately for all queue usage up to
 * the current jiffies and restarts the counters again.
 */
void disk_round_stats(struct gendisk *disk)
{
	unsigned long now = jiffies;

1979
	disk_stat_add(disk, time_in_queue, 
1980
			disk->in_flight * (now - disk->stamp));
1981 1982
	disk->stamp = now;

1983
	if (disk->in_flight)
1984
		disk_stat_add(disk, io_ticks, (now - disk->stamp_idle));
1985 1986
	disk->stamp_idle = now;
}
Linus Torvalds's avatar
Linus Torvalds committed
1987

1988 1989 1990
/*
 * queue lock must be held
 */
1991
void __blk_put_request(request_queue_t *q, struct request *req)
Linus Torvalds's avatar
Linus Torvalds committed
1992
{
Linus Torvalds's avatar
Linus Torvalds committed
1993
	struct request_list *rl = req->rl;
1994 1995 1996

	if (unlikely(!q))
		return;
1997 1998
	if (unlikely(--req->ref_count))
		return;
Linus Torvalds's avatar
Linus Torvalds committed
1999

Linus Torvalds's avatar
Linus Torvalds committed
2000
	req->rq_status = RQ_INACTIVE;
Linus Torvalds's avatar
Linus Torvalds committed
2001
	req->q = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
2002
	req->rl = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
2003 2004

	/*
Linus Torvalds's avatar
Linus Torvalds committed
2005
	 * Request may not have originated from ll_rw_blk. if not,
Linus Torvalds's avatar
Linus Torvalds committed
2006
	 * it didn't come out of our reserved rq pools
Linus Torvalds's avatar
Linus Torvalds committed
2007
	 */
Linus Torvalds's avatar
Linus Torvalds committed
2008
	if (rl) {
2009
		int rw = rq_data_dir(req);
2010

2011 2012
		elv_completed_request(q, req);

2013 2014
		BUG_ON(!list_empty(&req->queuelist));

2015
		blk_free_request(q, req);
2016
		freed_request(q, rw);
Linus Torvalds's avatar
Linus Torvalds committed
2017 2018 2019
	}
}

2020 2021 2022
void blk_put_request(struct request *req)
{
	/*
	 * if req->rl isn't set, this request didn't originate from the
	 * block layer, so it's safe to just disregard it
	 */
2026
	if (req->rl) {
2027
		unsigned long flags;
2028
		request_queue_t *q = req->q;
2029 2030 2031 2032 2033 2034 2035

		spin_lock_irqsave(q->queue_lock, flags);
		__blk_put_request(q, req);
		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

2036 2037
EXPORT_SYMBOL(blk_put_request);

2038 2039 2040 2041 2042 2043
/**
 * blk_congestion_wait - wait for a queue to become uncongested
 * @rw: READ or WRITE
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
2044 2045
 * If no queues are congested then just wait for the next request to be
 * returned.
2046
 */
2047
long blk_congestion_wait(int rw, long timeout)
2048
{
2049
	long ret;
2050
	DEFINE_WAIT(wait);
2051
	wait_queue_head_t *wqh = &congestion_wqh[rw];
2052 2053

	blk_run_queues();
2054
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
2055
	ret = io_schedule_timeout(timeout);
2056
	finish_wait(wqh, &wait);
2057
	return ret;
2058 2059
}

2060 2061
EXPORT_SYMBOL(blk_congestion_wait);
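
/*
 * Usage sketch (illustrative only): writeback-style callers typically back
 * off for a short while when the block layer is congested instead of
 * spinning.  "more_io_pending" and "issue_more_io" are hypothetical helpers
 * standing in for the caller's own work loop.
 */
#if 0
static int more_io_pending(void);	/* hypothetical */
static int issue_more_io(void);		/* hypothetical, -EAGAIN when congested */

static void mydev_flush_all(void)
{
	while (more_io_pending()) {
		if (issue_more_io() == -EAGAIN)
			blk_congestion_wait(WRITE, HZ / 10);
	}
}
#endif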

/*
 * Has to be called with the request spinlock acquired
 */
2065
static int attempt_merge(request_queue_t *q, struct request *req,
Linus Torvalds's avatar
Linus Torvalds committed
2066
			  struct request *next)
Linus Torvalds's avatar
Linus Torvalds committed
2067
{
Linus Torvalds's avatar
Linus Torvalds committed
2068
	if (!rq_mergeable(req) || !rq_mergeable(next))
2069
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2070 2071 2072 2073

	/*
	 * not contiguous
	 */
Linus Torvalds's avatar
Linus Torvalds committed
2074
	if (req->sector + req->nr_sectors != next->sector)
2075
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2076

Linus Torvalds's avatar
Linus Torvalds committed
2077
	if (rq_data_dir(req) != rq_data_dir(next)
2078
	    || req->rq_disk != next->rq_disk
Linus Torvalds's avatar
Linus Torvalds committed
2079
	    || next->waiting || next->special)
2080
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2081

Linus Torvalds's avatar
Linus Torvalds committed
2082
	/*
Linus Torvalds's avatar
Linus Torvalds committed
2083 2084 2085 2086
	 * If we are allowed to merge, then append bio list
	 * from next to rq and release next. merge_requests_fn
	 * will have updated segment counts, update sector
	 * counts here.
Linus Torvalds's avatar
Linus Torvalds committed
2087
	 */
2088 2089
	if (!q->merge_requests_fn(q, req, next))
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2090

2091 2092 2093 2094 2095 2096 2097 2098 2099
	/*
	 * At this point we have either done a back merge
	 * or front merge. We need the smaller start_time of
	 * the merged requests to be the current request
	 * for accounting purposes.
	 */
	if (time_after(req->start_time, next->start_time))
		req->start_time = next->start_time;

2100 2101
	req->biotail->bi_next = next->bio;
	req->biotail = next->biotail;
Linus Torvalds's avatar
Linus Torvalds committed
2102

2103
	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2104

2105
	elv_merge_requests(q, req, next);
2106

2107 2108
	if (req->rq_disk) {
		disk_round_stats(req->rq_disk);
2109
		req->rq_disk->in_flight--;
Linus Torvalds's avatar
Linus Torvalds committed
2110
	}
2111

2112 2113
	__blk_put_request(q, next);
	return 1;
Linus Torvalds's avatar
Linus Torvalds committed
2114 2115
}

2116
static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
2117
{
2118
	struct request *next = elv_latter_request(q, rq);
Linus Torvalds's avatar
Linus Torvalds committed
2119

2120 2121 2122 2123
	if (next)
		return attempt_merge(q, rq, next);

	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2124 2125
}

2126
static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
2127
{
2128
	struct request *prev = elv_former_request(q, rq);
Linus Torvalds's avatar
Linus Torvalds committed
2129

2130 2131 2132 2133
	if (prev)
		return attempt_merge(q, prev, rq);

	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2134
}
Linus Torvalds's avatar
Linus Torvalds committed
2135

Linus Torvalds's avatar
Linus Torvalds committed
2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151
/**
 * blk_attempt_remerge  - attempt to remerge active head with next request
 * @q:    The &request_queue_t belonging to the device
 * @rq:   The head request (usually)
 *
 * Description:
 *    For head-active devices, the queue can easily be unplugged so quickly
 *    that proper merging is not done on the front request. This may hurt
 *    performance greatly for some devices. The block layer cannot safely
 *    do merging on that first request for these queues, but the driver can
 *    call this function and make it happen any way. Only the driver knows
 *    when it is safe to do so.
 **/
void blk_attempt_remerge(request_queue_t *q, struct request *rq)
{
	unsigned long flags;
Linus Torvalds's avatar
Linus Torvalds committed
2152

Linus Torvalds's avatar
Linus Torvalds committed
2153
	spin_lock_irqsave(q->queue_lock, flags);
Linus Torvalds's avatar
Linus Torvalds committed
2154
	attempt_back_merge(q, rq);
Linus Torvalds's avatar
Linus Torvalds committed
2155
	spin_unlock_irqrestore(q->queue_lock, flags);
Linus Torvalds's avatar
Linus Torvalds committed
2156
}
Linus Torvalds's avatar
Linus Torvalds committed
2157

2158 2159
EXPORT_SYMBOL(blk_attempt_remerge);

Martin Dalecki's avatar
Martin Dalecki committed
2160 2161 2162 2163 2164 2165 2166 2167
/*
 * Non-locking blk_attempt_remerge variant.
 */
void __blk_attempt_remerge(request_queue_t *q, struct request *rq)
{
	attempt_back_merge(q, rq);
}

2168 2169
EXPORT_SYMBOL(__blk_attempt_remerge);

Linus Torvalds's avatar
Linus Torvalds committed
2170 2171 2172
static int __make_request(request_queue_t *q, struct bio *bio)
{
	struct request *req, *freereq = NULL;
Jens Axboe's avatar
Jens Axboe committed
2173
	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
Linus Torvalds's avatar
Linus Torvalds committed
2174
	sector_t sector;
Linus Torvalds's avatar
Linus Torvalds committed
2175

Linus Torvalds's avatar
Linus Torvalds committed
2176 2177
	sector = bio->bi_sector;
	nr_sectors = bio_sectors(bio);
Jens Axboe's avatar
Jens Axboe committed
2178 2179
	cur_nr_sectors = bio_cur_sectors(bio);

Linus Torvalds's avatar
Linus Torvalds committed
2180
	rw = bio_data_dir(bio);
Linus Torvalds's avatar
Linus Torvalds committed
2181 2182

	/*
Linus Torvalds's avatar
Linus Torvalds committed
2183 2184 2185
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
Linus Torvalds's avatar
Linus Torvalds committed
2186
	 */
Linus Torvalds's avatar
Linus Torvalds committed
2187 2188
	blk_queue_bounce(q, &bio);

Linus Torvalds's avatar
Linus Torvalds committed
2189
	spin_lock_prefetch(q->queue_lock);
Linus Torvalds's avatar
Linus Torvalds committed
2190

Linus Torvalds's avatar
Linus Torvalds committed
2191
	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
Linus Torvalds's avatar
Linus Torvalds committed
2192

Jens Axboe's avatar
Jens Axboe committed
2193
	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
Jens Axboe's avatar
Jens Axboe committed
2194

Linus Torvalds's avatar
Linus Torvalds committed
2195
again:
2196
	spin_lock_irq(q->queue_lock);
Linus Torvalds's avatar
Linus Torvalds committed
2197

Jens Axboe's avatar
Jens Axboe committed
2198
	if (elv_queue_empty(q)) {
Linus Torvalds's avatar
Linus Torvalds committed
2199
		blk_plug_device(q);
Linus Torvalds's avatar
Linus Torvalds committed
2200
		goto get_rq;
Linus Torvalds's avatar
Linus Torvalds committed
2201
	}
Jens Axboe's avatar
Jens Axboe committed
2202 2203
	if (barrier)
		goto get_rq;
Linus Torvalds's avatar
Linus Torvalds committed
2204

Jens Axboe's avatar
Jens Axboe committed
2205
	el_ret = elv_merge(q, &req, bio);
Linus Torvalds's avatar
Linus Torvalds committed
2206
	switch (el_ret) {
Linus Torvalds's avatar
Linus Torvalds committed
2207
		case ELEVATOR_BACK_MERGE:
Linus Torvalds's avatar
Linus Torvalds committed
2208
			BUG_ON(!rq_mergeable(req));
Jens Axboe's avatar
Jens Axboe committed
2209

Jens Axboe's avatar
Jens Axboe committed
2210
			if (!q->back_merge_fn(q, req, bio))
Linus Torvalds's avatar
Linus Torvalds committed
2211
				break;
Linus Torvalds's avatar
Linus Torvalds committed
2212

Linus Torvalds's avatar
Linus Torvalds committed
2213 2214 2215
			req->biotail->bi_next = bio;
			req->biotail = bio;
			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
Linus Torvalds's avatar
Linus Torvalds committed
2216
			drive_stat_acct(req, nr_sectors, 0);
2217 2218
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req);
Linus Torvalds's avatar
Linus Torvalds committed
2219 2220 2221
			goto out;

		case ELEVATOR_FRONT_MERGE:
Linus Torvalds's avatar
Linus Torvalds committed
2222
			BUG_ON(!rq_mergeable(req));
Jens Axboe's avatar
Jens Axboe committed
2223

Jens Axboe's avatar
Jens Axboe committed
2224
			if (!q->front_merge_fn(q, req, bio))
Linus Torvalds's avatar
Linus Torvalds committed
2225
				break;
Linus Torvalds's avatar
Linus Torvalds committed
2226

Linus Torvalds's avatar
Linus Torvalds committed
2227
			bio->bi_next = req->bio;
Jens Axboe's avatar
Jens Axboe committed
2228 2229 2230 2231
			req->cbio = req->bio = bio;
			req->nr_cbio_segments = bio_segments(bio);
			req->nr_cbio_sectors = bio_sectors(bio);

Linus Torvalds's avatar
Linus Torvalds committed
2232 2233 2234 2235 2236 2237 2238 2239
			/*
			 * may not be valid. if the low level driver said
			 * it didn't need a bounce buffer then it better
			 * not touch req->buffer either...
			 */
			req->buffer = bio_data(bio);
			req->current_nr_sectors = cur_nr_sectors;
			req->hard_cur_sectors = cur_nr_sectors;
Linus Torvalds's avatar
Linus Torvalds committed
2240
			req->sector = req->hard_sector = sector;
Linus Torvalds's avatar
Linus Torvalds committed
2241
			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
Linus Torvalds's avatar
Linus Torvalds committed
2242
			drive_stat_acct(req, nr_sectors, 0);
2243 2244
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req);
Linus Torvalds's avatar
Linus Torvalds committed
2245
			goto out;
Linus Torvalds's avatar
Linus Torvalds committed
2246

Linus Torvalds's avatar
Linus Torvalds committed
2247 2248 2249 2250 2251 2252 2253 2254 2255 2256
		/*
		 * elevator says don't/can't merge. get new request
		 */
		case ELEVATOR_NO_MERGE:
			break;

		default:
			printk("elevator returned crap (%d)\n", el_ret);
			BUG();
	}
Linus Torvalds's avatar
Linus Torvalds committed
2257

Linus Torvalds's avatar
Linus Torvalds committed
2258
	/*
Linus Torvalds's avatar
Linus Torvalds committed
2259 2260 2261
	 * Grab a free request from the freelist - if that is empty, check
	 * if we are doing read ahead and abort instead of blocking for
	 * a free slot.
Linus Torvalds's avatar
Linus Torvalds committed
2262 2263 2264 2265 2266
	 */
get_rq:
	if (freereq) {
		req = freereq;
		freereq = NULL;
2267
	} else {
Linus Torvalds's avatar
Linus Torvalds committed
2268
		spin_unlock_irq(q->queue_lock);
2269
		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
2270 2271 2272
			/*
			 * READA bit set
			 */
Jens Axboe's avatar
Jens Axboe committed
2273
			if (ra)
2274 2275 2276 2277
				goto end_io;
	
			freereq = get_request_wait(q, rw);
		}
Linus Torvalds's avatar
Linus Torvalds committed
2278 2279 2280
		goto again;
	}

2281 2282
	req->flags |= REQ_CMD;

Linus Torvalds's avatar
Linus Torvalds committed
2283
	/*
2284 2285
	 * inherit FAILFAST from bio and don't stack up
	 * retries for read ahead
Linus Torvalds's avatar
Linus Torvalds committed
2286
	 */
2287 2288
	if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))	
		req->flags |= REQ_FAILFAST;
Linus Torvalds's avatar
Linus Torvalds committed
2289 2290 2291 2292 2293

	/*
	 * REQ_BARRIER implies no merging, but lets make it explicit
	 */
	if (barrier)
Jens Axboe's avatar
Jens Axboe committed
2294
		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
Linus Torvalds's avatar
Linus Torvalds committed
2295

Linus Torvalds's avatar
Linus Torvalds committed
2296 2297
	req->errors = 0;
	req->hard_sector = req->sector = sector;
Linus Torvalds's avatar
Linus Torvalds committed
2298 2299
	req->hard_nr_sectors = req->nr_sectors = nr_sectors;
	req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
Linus Torvalds's avatar
Linus Torvalds committed
2300
	req->nr_phys_segments = bio_phys_segments(q, bio);
Linus Torvalds's avatar
Linus Torvalds committed
2301
	req->nr_hw_segments = bio_hw_segments(q, bio);
Jens Axboe's avatar
Jens Axboe committed
2302 2303
	req->nr_cbio_segments = bio_segments(bio);
	req->nr_cbio_sectors = bio_sectors(bio);
Linus Torvalds's avatar
Linus Torvalds committed
2304
	req->buffer = bio_data(bio);	/* see ->buffer comment above */
Linus Torvalds's avatar
Linus Torvalds committed
2305
	req->waiting = NULL;
Jens Axboe's avatar
Jens Axboe committed
2306
	req->cbio = req->bio = req->biotail = bio;
2307
	req->rq_disk = bio->bi_bdev->bd_disk;
2308
	req->start_time = jiffies;
2309

Jens Axboe's avatar
Jens Axboe committed
2310
	add_request(q, req);
Linus Torvalds's avatar
Linus Torvalds committed
2311
out:
Linus Torvalds's avatar
Linus Torvalds committed
2312
	if (freereq)
2313
		__blk_put_request(q, freereq);
2314 2315

	if (blk_queue_plugged(q)) {
2316
		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
2317

2318 2319 2320
		if (nr_queued == q->unplug_thresh)
			__generic_unplug_device(q);
	}
Linus Torvalds's avatar
Linus Torvalds committed
2321
	spin_unlock_irq(q->queue_lock);
Linus Torvalds's avatar
Linus Torvalds committed
2322
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
2323

Linus Torvalds's avatar
Linus Torvalds committed
2324
end_io:
2325
	bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
Linus Torvalds's avatar
Linus Torvalds committed
2326 2327 2328
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
2329 2330 2331 2332 2333
/*
 * If bio->bi_dev is a partition, remap the location
 */
static inline void blk_partition_remap(struct bio *bio)
{
2334 2335
	struct block_device *bdev = bio->bi_bdev;

2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350
	if (bdev != bdev->bd_contains) {
		struct hd_struct *p = bdev->bd_part;

		switch (bio->bi_rw) {
		case READ:
			p->read_sectors += bio_sectors(bio);
			p->reads++;
			break;
		case WRITE:
			p->write_sectors += bio_sectors(bio);
			p->writes++;
			break;
		}
		bio->bi_sector += p->start_sect;
		bio->bi_bdev = bdev->bd_contains;
2351
	}
Linus Torvalds's avatar
Linus Torvalds committed
2352 2353
}

Linus Torvalds's avatar
Linus Torvalds committed
2354
/**
2355
 * generic_make_request: hand a buffer to its device driver for I/O
Linus Torvalds's avatar
Linus Torvalds committed
2356
 * @bio:  The bio describing the location in memory and on the device.
Linus Torvalds's avatar
Linus Torvalds committed
2357 2358
 *
 * generic_make_request() is used to make I/O requests of block
Linus Torvalds's avatar
Linus Torvalds committed
2359 2360
 * devices. It is passed a &struct bio, which describes the I/O that needs
 * to be done.
Linus Torvalds's avatar
Linus Torvalds committed
2361 2362 2363
 *
 * generic_make_request() does not return any status.  The
 * success/failure status of the request, along with notification of
Linus Torvalds's avatar
Linus Torvalds committed
2364
 * completion, is delivered asynchronously through the bio->bi_end_io
Linus Torvalds's avatar
Linus Torvalds committed
2365 2366
 * function described (one day) elsewhere.
 *
Linus Torvalds's avatar
Linus Torvalds committed
2367 2368
 * The caller of generic_make_request must make sure that bi_io_vec
 * are set to describe the memory buffer, and that bi_dev and bi_sector are
Linus Torvalds's avatar
Linus Torvalds committed
2369
 * set to describe the device address, and the
Linus Torvalds's avatar
Linus Torvalds committed
2370 2371
 * bi_end_io and optionally bi_private are set to describe how
 * completion notification should be signaled.
Linus Torvalds's avatar
Linus Torvalds committed
2372
 *
Linus Torvalds's avatar
Linus Torvalds committed
2373 2374
 * generic_make_request and the drivers it calls may use bi_next if this
 * bio happens to be merged with someone else, and may change bi_dev and
Linus Torvalds's avatar
Linus Torvalds committed
2375
 * bi_sector for remaps as it sees fit.  So the values of these fields
Linus Torvalds's avatar
Linus Torvalds committed
2376
 * should NOT be depended on after the call to generic_make_request.
2377
 */
Linus Torvalds's avatar
Linus Torvalds committed
2378
void generic_make_request(struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
2379 2380
{
	request_queue_t *q;
Andries E. Brouwer's avatar
Andries E. Brouwer committed
2381
	sector_t maxsector;
Linus Torvalds's avatar
Linus Torvalds committed
2382
	int ret, nr_sectors = bio_sectors(bio);
Linus Torvalds's avatar
Linus Torvalds committed
2383

Linus Torvalds's avatar
Linus Torvalds committed
2384
	/* Test device or partition size, when known. */
2385
	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
Andries E. Brouwer's avatar
Andries E. Brouwer committed
2386 2387 2388 2389 2390
	if (maxsector) {
		sector_t sector = bio->bi_sector;

		if (maxsector < nr_sectors ||
		    maxsector - nr_sectors < sector) {
2391
			char b[BDEVNAME_SIZE];
Andries E. Brouwer's avatar
Andries E. Brouwer committed
2392 2393 2394 2395 2396
			/* This may well happen - the kernel calls
			 * bread() without checking the size of the
			 * device, e.g., when mounting a device. */
			printk(KERN_INFO
			       "attempt to access beyond end of device\n");
2397
			printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
2398
			       bdevname(bio->bi_bdev, b),
2399
			       bio->bi_rw,
2400
			       (unsigned long long) sector + nr_sectors,
Andries E. Brouwer's avatar
Andries E. Brouwer committed
2401 2402
			       (long long) maxsector);

Linus Torvalds's avatar
Linus Torvalds committed
2403 2404
			set_bit(BIO_EOF, &bio->bi_flags);
			goto end_io;
Linus Torvalds's avatar
Linus Torvalds committed
2405 2406 2407 2408 2409 2410 2411
		}
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
Linus Torvalds's avatar
Linus Torvalds committed
2412 2413
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
Linus Torvalds's avatar
Linus Torvalds committed
2414 2415 2416
	 * Stacking drivers are expected to know what they are doing.
	 */
	do {
2417 2418
		char b[BDEVNAME_SIZE];

2419
		q = bdev_get_queue(bio->bi_bdev);
Linus Torvalds's avatar
Linus Torvalds committed
2420 2421
		if (!q) {
			printk(KERN_ERR
2422 2423 2424 2425
			       "generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
Linus Torvalds's avatar
Linus Torvalds committed
2426
end_io:
2427
			bio_endio(bio, bio->bi_size, -EIO);
Linus Torvalds's avatar
Linus Torvalds committed
2428 2429
			break;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2430

2431
		if (unlikely(bio_sectors(bio) > q->max_sectors)) {
2432
			printk("bio too big device %s (%u > %u)\n", 
2433 2434 2435
				bdevname(bio->bi_bdev, b),
				bio_sectors(bio),
				q->max_sectors);
2436 2437
			goto end_io;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2438

2439 2440 2441
		if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))
			goto end_io;

Linus Torvalds's avatar
Linus Torvalds committed
2442 2443 2444 2445 2446 2447
		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		blk_partition_remap(bio);

Linus Torvalds's avatar
Linus Torvalds committed
2448 2449
		ret = q->make_request_fn(q, bio);
	} while (ret);
Linus Torvalds's avatar
Linus Torvalds committed
2450 2451
}

2452 2453
EXPORT_SYMBOL(generic_make_request);
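
/*
 * Usage sketch (illustrative only): a trivial remapping (stacking) driver's
 * make_request handler.  It may either modify the bio and return non-zero so
 * the loop above re-resolves the target queue, or resubmit explicitly via
 * generic_make_request() and return 0; the latter is shown.  "lower_bdev"
 * and "lower_offset" are hypothetical.
 */
#if 0
static struct block_device *lower_bdev;	/* assumed: set up elsewhere */
static sector_t lower_offset;

static int remap_make_request(request_queue_t *q, struct bio *bio)
{
	bio->bi_bdev = lower_bdev;
	bio->bi_sector += lower_offset;
	generic_make_request(bio);
	return 0;
}
#endif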

/**
Linus Torvalds's avatar
Linus Torvalds committed
2455
 * submit_bio: submit a bio to the block device layer for I/O
Linus Torvalds's avatar
Linus Torvalds committed
2456
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
Linus Torvalds's avatar
Linus Torvalds committed
2457
 * @bio: The &struct bio which describes the I/O
Linus Torvalds's avatar
Linus Torvalds committed
2458
 *
Linus Torvalds's avatar
Linus Torvalds committed
2459 2460 2461
 * submit_bio() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work. Both are fairly rough
 * interfaces; @bio must be set up and ready for I/O.
Linus Torvalds's avatar
Linus Torvalds committed
2462 2463
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
2464
int submit_bio(int rw, struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
2465
{
2466
	int count = bio_sectors(bio);
Linus Torvalds's avatar
Linus Torvalds committed
2467

Linus Torvalds's avatar
Linus Torvalds committed
2468
	BIO_BUG_ON(!bio->bi_size);
Linus Torvalds's avatar
Linus Torvalds committed
2469 2470 2471
	BIO_BUG_ON(!bio->bi_io_vec);
	bio->bi_rw = rw;
	if (rw & WRITE)
2472
		mod_page_state(pgpgout, count);
Linus Torvalds's avatar
Linus Torvalds committed
2473
	else
2474
		mod_page_state(pgpgin, count);
Andrew Morton's avatar
Andrew Morton committed
2475 2476 2477 2478 2479 2480 2481 2482 2483 2484

	if (unlikely(block_dump)) {
		char b[BDEVNAME_SIZE];
		printk("%s(%d): %s block %Lu on %s\n",
			current->comm, current->pid,
			(rw & WRITE) ? "WRITE" : "READ",
			(unsigned long long)bio->bi_sector,
			bdevname(bio->bi_bdev,b));
	}

Linus Torvalds's avatar
Linus Torvalds committed
2485 2486 2487 2488
	generic_make_request(bio);
	return 1;
}

2489 2490
EXPORT_SYMBOL(submit_bio);
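
/*
 * Usage sketch (illustrative only): read one page from a block device and
 * take the completion through bi_end_io.  "mydev_read_page" is hypothetical;
 * bio_alloc()/bio_add_page() are the usual bio building helpers.
 */
#if 0
static int mydev_read_page(struct block_device *bdev, sector_t sector,
			   struct page *page, bio_end_io_t *done, void *priv)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = done;
	bio->bi_private = priv;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);
	return 0;
}
#endif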

/**
 * blk_rq_next_segment
 * @rq:		the request being processed
 *
 * Description:
 *	Points to the next segment in the request if the current segment
 *	is complete. Leaves things unchanged if this segment is not over
 *	or if no more segments are left in this request.
 *
 *	Meant to be used for bio traversal during I/O submission
 *	Does not affect any I/O completions or update completion state
 *	in the request, and does not modify any bio fields.
 *
 *	Decrementing rq->nr_sectors, rq->current_nr_sectors and
 *	rq->nr_cbio_sectors as data is transferred is the caller's
 *	responsibility and should be done before calling this routine.
 **/
void blk_rq_next_segment(struct request *rq)
{
	if (rq->current_nr_sectors > 0)
		return;

	if (rq->nr_cbio_sectors > 0) {
		--rq->nr_cbio_segments;
		rq->current_nr_sectors = blk_rq_vec(rq)->bv_len >> 9;
	} else {
		if ((rq->cbio = rq->cbio->bi_next)) {
			rq->nr_cbio_segments = bio_segments(rq->cbio);
			rq->nr_cbio_sectors = bio_sectors(rq->cbio);
 			rq->current_nr_sectors = bio_cur_sectors(rq->cbio);
		}
 	}

	/* remember the size of this segment before we start I/O */
	rq->hard_cur_sectors = rq->current_nr_sectors;
}

/**
 * process_that_request_first	-	process partial request submission
 * @req:	the request being processed
 * @nr_sectors:	number of sectors I/O has been submitted on
 *
 * Description:
 *	May be used for processing bio's while submitting I/O without
 *	signalling completion. Fails if more data is requested than is
 *	available in the request in which case it doesn't advance any
 *	pointers.
 *
 *	Assumes a request is correctly set up. No sanity checks.
 *
 * Return:
 *	0 - no more data left to submit (not processed)
 *	1 - data available to submit for this request (processed)
 **/
int process_that_request_first(struct request *req, unsigned int nr_sectors)
{
	unsigned int nsect;

	if (req->nr_sectors < nr_sectors)
		return 0;

	req->nr_sectors -= nr_sectors;
	req->sector += nr_sectors;
	while (nr_sectors) {
		nsect = min_t(unsigned, req->current_nr_sectors, nr_sectors);
		req->current_nr_sectors -= nsect;
		nr_sectors -= nsect;
		if (req->cbio) {
			req->nr_cbio_sectors -= nsect;
			blk_rq_next_segment(req);
		}
	}
	return 1;
}

2566 2567
EXPORT_SYMBOL(process_that_request_first);

2568
void blk_recalc_rq_segments(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
2569 2570 2571 2572
{
	struct bio *bio;
	int nr_phys_segs, nr_hw_segs;

2573 2574 2575
	if (!rq->bio)
		return;

Linus Torvalds's avatar
Linus Torvalds committed
2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588
	nr_phys_segs = nr_hw_segs = 0;
	rq_for_each_bio(bio, rq) {
		/* Force bio hw/phys segs to be recalculated. */
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

		nr_phys_segs += bio_phys_segments(rq->q, bio);
		nr_hw_segs += bio_hw_segments(rq->q, bio);
	}

	rq->nr_phys_segments = nr_phys_segs;
	rq->nr_hw_segments = nr_hw_segs;
}

2589
void blk_recalc_rq_sectors(struct request *rq, int nsect)
Linus Torvalds's avatar
Linus Torvalds committed
2590
{
2591
	if (blk_fs_request(rq)) {
Linus Torvalds's avatar
Linus Torvalds committed
2592
		rq->hard_sector += nsect;
Jens Axboe's avatar
Jens Axboe committed
2593
		rq->hard_nr_sectors -= nsect;
Linus Torvalds's avatar
Linus Torvalds committed
2594

Jens Axboe's avatar
Jens Axboe committed
2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610
		/*
		 * Move the I/O submission pointers ahead if required,
		 * i.e. for drivers not aware of rq->cbio.
		 */
		if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
		    (rq->sector <= rq->hard_sector)) {
			rq->sector = rq->hard_sector;
			rq->nr_sectors = rq->hard_nr_sectors;
			rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
			rq->current_nr_sectors = rq->hard_cur_sectors;
			rq->nr_cbio_segments = bio_segments(rq->bio);
			rq->nr_cbio_sectors = bio_sectors(rq->bio);
			rq->buffer = bio_data(rq->bio);

			rq->cbio = rq->bio;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2611

Linus Torvalds's avatar
Linus Torvalds committed
2612 2613 2614 2615 2616 2617 2618 2619
		/*
		 * if total number of sectors is less than the first segment
		 * size, something has gone terribly wrong
		 */
		if (rq->nr_sectors < rq->current_nr_sectors) {
			printk("blk: request botched\n");
			rq->nr_sectors = rq->current_nr_sectors;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2620 2621 2622
	}
}

2623 2624
static int __end_that_request_first(struct request *req, int uptodate,
				    int nr_bytes)
Linus Torvalds's avatar
Linus Torvalds committed
2625
{
2626
	int total_bytes, bio_nbytes, error = 0, next_idx = 0;
Linus Torvalds's avatar
Linus Torvalds committed
2627
	struct bio *bio;
Linus Torvalds's avatar
Linus Torvalds committed
2628

2629 2630 2631 2632 2633 2634 2635
	/*
	 * for a REQ_BLOCK_PC request, we want to carry any eventual
	 * sense key with us all the way through
	 */
	if (!blk_pc_request(req))
		req->errors = 0;

Jens Axboe's avatar
Jens Axboe committed
2636 2637
	if (!uptodate) {
		error = -EIO;
2638
		if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
Jens Axboe's avatar
Jens Axboe committed
2639
			printk("end_request: I/O error, dev %s, sector %llu\n",
2640
				req->rq_disk ? req->rq_disk->disk_name : "?",
Jens Axboe's avatar
Jens Axboe committed
2641
				(unsigned long long)req->sector);
Jens Axboe's avatar
Jens Axboe committed
2642
	}
Linus Torvalds's avatar
Linus Torvalds committed
2643

2644
	total_bytes = bio_nbytes = 0;
Linus Torvalds's avatar
Linus Torvalds committed
2645
	while ((bio = req->bio)) {
2646
		int nbytes;
Jens Axboe's avatar
Jens Axboe committed
2647

2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660
		if (nr_bytes >= bio->bi_size) {
			req->bio = bio->bi_next;
			nbytes = bio->bi_size;
			bio_endio(bio, nbytes, error);
			next_idx = 0;
			bio_nbytes = 0;
		} else {
			int idx = bio->bi_idx + next_idx;

			if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
				blk_dump_rq_flags(req, "__end_that");
				printk("%s: bio idx %d >= vcnt %d\n",
						__FUNCTION__,
Jens Axboe's avatar
Jens Axboe committed
2661
						bio->bi_idx, bio->bi_vcnt);
2662 2663
				break;
			}
Linus Torvalds's avatar
Linus Torvalds committed
2664

2665 2666
			nbytes = bio_iovec_idx(bio, idx)->bv_len;
			BIO_BUG_ON(nbytes > bio->bi_size);
Linus Torvalds's avatar
Linus Torvalds committed
2667

2668 2669 2670 2671 2672 2673 2674 2675
			/*
			 * not a complete bvec done
			 */
			if (unlikely(nbytes > nr_bytes)) {
				bio_nbytes += nr_bytes;
				total_bytes += nr_bytes;
				break;
			}
Linus Torvalds's avatar
Linus Torvalds committed
2676

2677 2678 2679 2680 2681
			/*
			 * advance to the next vector
			 */
			next_idx++;
			bio_nbytes += nbytes;
2682
		}
Linus Torvalds's avatar
Linus Torvalds committed
2683

2684 2685
		total_bytes += nbytes;
		nr_bytes -= nbytes;
Jens Axboe's avatar
Jens Axboe committed
2686

Linus Torvalds's avatar
Linus Torvalds committed
2687
		if ((bio = req->bio)) {
Linus Torvalds's avatar
Linus Torvalds committed
2688 2689 2690
			/*
			 * end more in this run, or just return 'not-done'
			 */
2691
			if (unlikely(nr_bytes <= 0))
Jens Axboe's avatar
Jens Axboe committed
2692
				break;
Linus Torvalds's avatar
Linus Torvalds committed
2693 2694
		}
	}
Linus Torvalds's avatar
Linus Torvalds committed
2695

Jens Axboe's avatar
Jens Axboe committed
2696 2697 2698 2699 2700 2701 2702 2703 2704
	/*
	 * completely done
	 */
	if (!req->bio)
		return 0;

	/*
	 * if the request wasn't completed, update state
	 */
2705 2706
	if (bio_nbytes) {
		bio_endio(bio, bio_nbytes, error);
2707 2708 2709
		bio->bi_idx += next_idx;
		bio_iovec(bio)->bv_offset += nr_bytes;
		bio_iovec(bio)->bv_len -= nr_bytes;
2710 2711 2712
	}

	blk_recalc_rq_sectors(req, total_bytes >> 9);
Jens Axboe's avatar
Jens Axboe committed
2713 2714
	blk_recalc_rq_segments(req);
	return 1;
Linus Torvalds's avatar
Linus Torvalds committed
2715 2716
}

2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735
/**
 * end_that_request_first - end I/O on a request
 * @req:      the request being processed
 * @uptodate: 0 for I/O error
 * @nr_sectors: number of sectors to end I/O on
 *
 * Description:
 *     Ends I/O on a number of sectors attached to @req, and sets it up
 *     for the next range of segments (if any) in the cluster.
 *
 * Return:
 *     0 - we are done with this request, call end_that_request_last()
 *     1 - still buffers pending for this request
 **/
int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
{
	return __end_that_request_first(req, uptodate, nr_sectors << 9);
}

2736 2737
EXPORT_SYMBOL(end_that_request_first);

2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757
/**
 * end_that_request_chunk - end I/O on a request
 * @req:      the request being processed
 * @uptodate: 0 for I/O error
 * @nr_bytes: number of bytes to complete
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, and sets it up
 *     for the next range of segments (if any). Like end_that_request_first(),
 *     but deals with bytes instead of sectors.
 *
 * Return:
 *     0 - we are done with this request, call end_that_request_last()
 *     1 - still buffers pending for this request
 **/
int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
{
	return __end_that_request_first(req, uptodate, nr_bytes);
}

2758 2759
EXPORT_SYMBOL(end_that_request_chunk);
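
/*
 * Usage sketch (illustrative only): byte-granular completion as used by
 * packet-style drivers, mirroring end_request() further down but counting
 * bytes.  The caller is assumed to hold q->queue_lock, as
 * end_that_request_last() requires; "mydev_finish_bytes" is hypothetical.
 */
#if 0
static void mydev_finish_bytes(struct request *rq, unsigned int bytes, int ok)
{
	if (!end_that_request_chunk(rq, ok, bytes)) {
		/* nothing left: drop it from the queue and complete it */
		blkdev_dequeue_request(rq);
		end_that_request_last(rq);
	}
}
#endif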

/*
 * queue lock must be held
 */
Linus Torvalds's avatar
Linus Torvalds committed
2763 2764
void end_that_request_last(struct request *req)
{
2765
	struct gendisk *disk = req->rq_disk;
2766
	struct completion *waiting = req->waiting;
Linus Torvalds's avatar
Linus Torvalds committed
2767

Andrew Morton's avatar
Andrew Morton committed
2768 2769 2770
	if (unlikely(laptop_mode))
		laptop_io_completion();

2771
	if (disk && blk_fs_request(req)) {
2772 2773 2774
		unsigned long duration = jiffies - req->start_time;
		switch (rq_data_dir(req)) {
		    case WRITE:
2775 2776
			disk_stat_inc(disk, writes);
			disk_stat_add(disk, write_ticks, duration);
2777 2778
			break;
		    case READ:
2779 2780
			disk_stat_inc(disk, reads);
			disk_stat_add(disk, read_ticks, duration);
2781 2782 2783
			break;
		}
		disk_round_stats(disk);
2784
		disk->in_flight--;
2785
	}
2786
	__blk_put_request(req->q, req);
2787 2788 2789
	/* Do this LAST! The structure may be freed immediately afterwards */
	if (waiting)
		complete(waiting);
Linus Torvalds's avatar
Linus Torvalds committed
2790 2791
}

2792 2793
EXPORT_SYMBOL(end_that_request_last);

2794 2795 2796 2797 2798 2799 2800 2801 2802
void end_request(struct request *req, int uptodate)
{
	if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
		add_disk_randomness(req->rq_disk);
		blkdev_dequeue_request(req);
		end_that_request_last(req);
	}
}

2803 2804
EXPORT_SYMBOL(end_request);

2805 2806 2807 2808 2809 2810 2811 2812 2813 2814
void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio)
{
	/* first three bits are identical in rq->flags and bio->bi_rw */
	rq->flags |= (bio->bi_rw & 7);

	rq->nr_phys_segments = bio_phys_segments(q, bio);
	rq->nr_hw_segments = bio_hw_segments(q, bio);
	rq->current_nr_sectors = bio_cur_sectors(bio);
	rq->hard_cur_sectors = rq->current_nr_sectors;
	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
Jens Axboe's avatar
Jens Axboe committed
2815 2816
	rq->nr_cbio_segments = bio_segments(bio);
	rq->nr_cbio_sectors = bio_sectors(bio);
2817 2818
	rq->buffer = bio_data(bio);

Jens Axboe's avatar
Jens Axboe committed
2819 2820 2821
	rq->cbio = rq->bio = rq->biotail = bio;
}

2822 2823
EXPORT_SYMBOL(blk_rq_bio_prep);

Jens Axboe's avatar
Jens Axboe committed
2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837
void blk_rq_prep_restart(struct request *rq)
{
	struct bio *bio;

	bio = rq->cbio = rq->bio;
	if (bio) {
		rq->nr_cbio_segments = bio_segments(bio);
		rq->nr_cbio_sectors = bio_sectors(bio);
		rq->hard_cur_sectors = bio_cur_sectors(bio);
		rq->buffer = bio_data(bio);
	}
	rq->sector = rq->hard_sector;
	rq->nr_sectors = rq->hard_nr_sectors;
	rq->current_nr_sectors = rq->hard_cur_sectors;
2838 2839
}

2840 2841
EXPORT_SYMBOL(blk_rq_prep_restart);

2842 2843 2844 2845 2846 2847 2848 2849 2850 2851
int kblockd_schedule_work(struct work_struct *work)
{
	return queue_work(kblockd_workqueue, work);
}

void kblockd_flush(void)
{
	flush_workqueue(kblockd_workqueue);
}
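
/*
 * Usage sketch (illustrative only): deferring work to the kblockd thread,
 * e.g. for unplug-style processing that must run in process context.
 * "mydev_work", "mydev_work_fn" and "mydev_kick" are hypothetical.
 */
#if 0
static void mydev_work_fn(void *data)
{
	/* runs later in process context on the kblockd workqueue */
}

static DECLARE_WORK(mydev_work, mydev_work_fn, NULL);

static void mydev_kick(void)
{
	kblockd_schedule_work(&mydev_work);
}
#endif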

int __init blk_dev_init(void)
{
2854 2855 2856 2857
	kblockd_workqueue = create_workqueue("kblockd");
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

Linus Torvalds's avatar
Linus Torvalds committed
2858
	request_cachep = kmem_cache_create("blkdev_requests",
2859
			sizeof(struct request), 0, 0, NULL, NULL);
	if (!request_cachep)
		panic("Can't create request pool slab cache\n");

	blk_max_low_pfn = max_low_pfn;
	blk_max_pfn = max_pfn;
	return 0;
}

/*
 * IO Context helper functions
 */
void put_io_context(struct io_context *ioc)
{
	if (ioc == NULL)
		return;

	BUG_ON(atomic_read(&ioc->refcount) == 0);

	if (atomic_dec_and_test(&ioc->refcount)) {
		if (ioc->aic && ioc->aic->dtor)
			ioc->aic->dtor(ioc->aic);
		kfree(ioc);
	}
}

/* Called by the exiting task */
void exit_io_context(void)
{
	unsigned long flags;
	struct io_context *ioc;

	local_irq_save(flags);
	ioc = current->io_context;
	if (ioc) {
		if (ioc->aic && ioc->aic->exit)
			ioc->aic->exit(ioc->aic);
		put_io_context(ioc);
		current->io_context = NULL;
	} else
		WARN_ON(1);
	local_irq_restore(flags);
}

/*
 * If the current task has no IO context then create one and initialise it.
 * If it does have a context, take a ref on it.
 *
 * This is always called in the context of the task which submitted the I/O.
 * But weird things happen, so we disable local interrupts to ensure exclusive
 * access to *current.
 */
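/*
 * For example (a sketch only), a caller that wants to hold on to the
 * submitting task's context pairs the two refcounting helpers:
 *
 *	struct io_context *ioc = get_io_context(GFP_ATOMIC);
 *	...
 *	put_io_context(ioc);
 *
 * Every successful get_io_context() must be balanced by a put_io_context().
 */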
struct io_context *get_io_context(int gfp_flags)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	struct io_context *ret;

	local_irq_save(flags);
	ret = tsk->io_context;
	if (ret == NULL) {
		ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
		if (ret) {
			atomic_set(&ret->refcount, 1);
			ret->pid = tsk->pid;
			ret->last_waited = jiffies; /* doesn't matter... */
			ret->nr_batch_requests = 0; /* because this is 0 */
			ret->aic = NULL;
			tsk->io_context = ret;
		}
	}
	if (ret)
		atomic_inc(&ret->refcount);
	local_irq_restore(flags);
	return ret;
}

void copy_io_context(struct io_context **pdst, struct io_context **psrc)
{
	struct io_context *src = *psrc;
	struct io_context *dst = *pdst;

	if (src) {
		BUG_ON(atomic_read(&src->refcount) == 0);
		atomic_inc(&src->refcount);
		put_io_context(dst);
		*pdst = src;
	}
}

void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
{
	struct io_context *temp;
	temp = *ioc1;
	*ioc1 = *ioc2;
	*ioc2 = temp;
}


/*
 * sysfs parts below
 */
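/*
 * These attributes are hooked up by blk_register_queue() below under the
 * disk's "queue" kobject, so nr_requests shows up as
 * /sys/block/<disk>/queue/nr_requests and can be tuned at runtime with
 * something like "echo 128 > /sys/block/hda/queue/nr_requests" (device
 * name purely illustrative).
 */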
struct queue_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct request_queue *, char *);
	ssize_t (*store)(struct request_queue *, const char *, size_t);
};

static ssize_t
queue_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
queue_var_store(unsigned long *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
	return queue_var_show(q->nr_requests, (page));
}

static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
	struct request_list *rl = &q->rq;

	int ret = queue_var_store(&q->nr_requests, page, count);
	if (q->nr_requests < BLKDEV_MIN_RQ)
		q->nr_requests = BLKDEV_MIN_RQ;

	if (rl->count[READ] >= queue_congestion_on_threshold(q))
		set_queue_congested(q, READ);
	else if (rl->count[READ] < queue_congestion_off_threshold(q))
		clear_queue_congested(q, READ);

	if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
		set_queue_congested(q, WRITE);
	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
		clear_queue_congested(q, WRITE);

	if (rl->count[READ] >= q->nr_requests) {
		blk_set_queue_full(q, READ);
	} else if (rl->count[READ]+1 <= q->nr_requests) {
		blk_clear_queue_full(q, READ);
		wake_up(&rl->wait[READ]);
	}

	if (rl->count[WRITE] >= q->nr_requests) {
		blk_set_queue_full(q, WRITE);
	} else if (rl->count[WRITE]+1 <= q->nr_requests) {
		blk_clear_queue_full(q, WRITE);
		wake_up(&rl->wait[WRITE]);
	}
	return ret;
}

static struct queue_sysfs_entry queue_requests_entry = {
	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
	.show = queue_requests_show,
	.store = queue_requests_store,
};

static struct attribute *default_attrs[] = {
	&queue_requests_entry.attr,
	NULL,
};

#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)

static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct queue_sysfs_entry *entry = to_queue(attr);
	struct request_queue *q;

	q = container_of(kobj, struct request_queue, kobj);
	if (!entry->show)
		return 0;

	return entry->show(q, page);
}

static ssize_t
queue_attr_store(struct kobject *kobj, struct attribute *attr,
		    const char *page, size_t length)
{
	struct queue_sysfs_entry *entry = to_queue(attr);
	struct request_queue *q;

	q = container_of(kobj, struct request_queue, kobj);
	if (!entry->store)
		return -EINVAL;

	return entry->store(q, page, length);
}

static struct sysfs_ops queue_sysfs_ops = {
	.show	= queue_attr_show,
	.store	= queue_attr_store,
};

struct kobj_type queue_ktype = {
	.sysfs_ops	= &queue_sysfs_ops,
	.default_attrs	= default_attrs,
};

int blk_register_queue(struct gendisk *disk)
{
	int ret;

	request_queue_t *q = disk->queue;

	if (!q || !q->request_fn)
		return -ENXIO;

	q->kobj.parent = kobject_get(&disk->kobj);
	if (!q->kobj.parent)
		return -EBUSY;

	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
	q->kobj.ktype = &queue_ktype;

	ret = kobject_register(&q->kobj);
	if (ret < 0)
		return ret;

	ret = elv_register_queue(q);
	if (ret) {
		kobject_unregister(&q->kobj);
		return ret;
	}

	return 0;
}

void blk_unregister_queue(struct gendisk *disk)
{
	request_queue_t *q = disk->queue;

	if (q && q->request_fn) {
		elv_unregister_queue(q);

		kobject_unregister(&q->kobj);
		kobject_put(&disk->kobj);
	}
}