Commit ae1b1539 authored by Tejun Heo, committed by Jens Axboe

block: reimplement FLUSH/FUA to support merge

The current FLUSH/FUA support has evolved from the implementation
which had to perform queue draining.  As such, sequencing is done
queue-wide one flush request after another.  However, with the
draining requirement gone, there's no reason to keep the queue-wide
sequential approach.

This patch reimplements FLUSH/FUA support such that each FLUSH/FUA
request is sequenced individually.  The actual FLUSH execution is
double buffered: whenever a request needs to execute a flush for either
its PRE- or POSTFLUSH step, it is queued on the pending queue.  Once
certain conditions are met, a single flush request is issued and, on
its completion, all pending requests proceed to the next sequence step.
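
Since the main flush-machinery diff is collapsed below, here is a
minimal userspace sketch of the double-buffered sequencing described
above.  All names (flush_state, queue_for_flush, issue_flush,
flush_completed) are illustrative, not the kernel API.

#include <stdio.h>

/* Illustrative model only; none of these names are the kernel API. */
enum {
        SEQ_PREFLUSH  = 1 << 0,         /* still needs a preceding flush */
        SEQ_DATA      = 1 << 1,         /* still needs its data write */
        SEQ_POSTFLUSH = 1 << 2,         /* still needs a trailing flush */
};

#define MAX_PENDING 16

struct flush_state {
        /*
         * Double buffer: requests waiting for the next flush collect on
         * pending[pending_idx].  Issuing a flush toggles pending_idx, so
         * new arrivals gather on the other list while the in-flight flush
         * covers everything on pending[running_idx].  The two indices
         * differ exactly while a flush is outstanding.
         */
        int pending[2][MAX_PENDING];    /* remaining-step masks */
        int npending[2];
        unsigned int pending_idx:1;
        unsigned int running_idx:1;
};

/* A request needing a PRE- or POSTFLUSH parks on the current pending list. */
static void queue_for_flush(struct flush_state *fs, int steps)
{
        int i = fs->pending_idx;

        fs->pending[i][fs->npending[i]++] = steps;
}

/* Issue one flush covering every request queued so far, if allowed. */
static int issue_flush(struct flush_state *fs)
{
        if (fs->pending_idx != fs->running_idx) /* a flush is already running */
                return 0;
        if (!fs->npending[fs->pending_idx])     /* nothing is waiting */
                return 0;
        fs->pending_idx ^= 1;                   /* start filling the other list */
        printf("flush issued for %d request(s)\n",
               fs->npending[fs->running_idx]);
        return 1;
}

/* On completion, every parked request has its flush step satisfied. */
static void flush_completed(struct flush_state *fs)
{
        int i = fs->running_idx;
        int r;

        for (r = 0; r < fs->npending[i]; r++) {
                int steps = fs->pending[i][r];

                steps &= ~(steps & -steps);     /* clear lowest remaining step */
                printf("request %d: remaining steps 0x%x\n", r, steps);
        }
        fs->npending[i] = 0;            /* requests move on (data write, done) */
        fs->running_idx ^= 1;           /* catch up with pending_idx */
}

int main(void)
{
        struct flush_state fs = { { { 0 } } };

        /* Two concurrent fsync-style requests end up sharing one flush. */
        queue_for_flush(&fs, SEQ_PREFLUSH | SEQ_DATA);
        queue_for_flush(&fs, SEQ_PREFLUSH | SEQ_DATA | SEQ_POSTFLUSH);
        if (issue_flush(&fs))
                flush_completed(&fs);
        return 0;
}

The point of the double buffer is that pending_idx and running_idx only
differ while a flush is in flight, so one spare list is enough to collect
the next batch while the current one completes.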

This allows arbitrary merging of different types of flushes.  How they
are merged can be controlled and tuned primarily by adjusting the
'conditions' mentioned above, which determine when to issue the next
flush.
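
Judging from the flush_pending_since and flush_data_in_flight fields
added to struct request_queue below, the issue decision presumably boils
down to a predicate along the following lines.  This is a sketch under
assumptions: the parameter names and the timeout policy are illustrative,
since the flush-machinery diff itself is collapsed in this view.

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the "conditions" in predicate form: keep at most one flush in
 * flight (which is what makes double buffering sufficient), and hold a new
 * flush back while data writes are still in flight -- they may want a
 * POSTFLUSH shortly and can then share it -- unless the oldest pending
 * request has already waited longer than a timeout.
 */
static bool should_issue_flush(bool flush_in_flight, bool have_pending,
                               bool data_in_flight, unsigned long now,
                               unsigned long pending_since,
                               unsigned long pending_timeout)
{
        if (flush_in_flight || !have_pending)
                return false;
        if (data_in_flight && now - pending_since < pending_timeout)
                return false;
        return true;
}

int main(void)
{
        /* Data writes in flight and the timeout not yet hit: defer. */
        printf("%d\n", should_issue_flush(false, true, true, 100, 50, 100));
        /* Same situation past the timeout: issue the flush anyway. */
        printf("%d\n", should_issue_flush(false, true, true, 200, 50, 100));
        return 0;
}

Holding the flush back while data writes are in flight lets requests that
will soon want a POSTFLUSH share it; the timeout bounds the extra latency
this adds.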

This is inspired by Darrick's patches to merge multiple zero-data
flushes, which helps workloads with highly concurrent fsync requests.

* As flush requests are never put on the IO scheduler, request fields
  used for flush share space with rq->rb_node.  rq->completion_data is
  moved out of the union.  This increases the request size by one
  pointer.

  As rq->elevator_private* are used only by the iosched too, it is
  possible to reduce the request size further.  However, to do that,
  we need to modify the request allocation path such that iosched data
  is not allocated for flush requests.

* FLUSH/FUA processing happens on insertion now instead of dispatch.

- Comments updated as per Vivek and Mike.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "Darrick J. Wong" <djwong@us.ibm.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
parent 143a87f4
@@ -134,8 +134,6 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
                           unsigned int nbytes, int error)
 {
-        struct request_queue *q = rq->q;
-
         if (error)
                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
         else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -159,8 +157,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
         /* don't actually finish bio if it's part of flush sequence */
         if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
                 bio_endio(bio, error);
-        else if (error && !q->flush_err)
-                q->flush_err = error;
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -519,7 +515,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         init_timer(&q->unplug_timer);
         setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
         INIT_LIST_HEAD(&q->timeout_list);
-        INIT_LIST_HEAD(&q->pending_flushes);
+        INIT_LIST_HEAD(&q->flush_queue[0]);
+        INIT_LIST_HEAD(&q->flush_queue[1]);
+        INIT_LIST_HEAD(&q->flush_data_in_flight);
         INIT_WORK(&q->unplug_work, blk_unplug_work);
 
         kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1198,7 +1196,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
         spin_lock_irq(q->queue_lock);
 
         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-                where = ELEVATOR_INSERT_FRONT;
+                where = ELEVATOR_INSERT_FLUSH;
                 goto get_rq;
         }
...
This diff is collapsed.
@@ -51,20 +51,16 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)         (!hlist_unhashed(&(rq)->hash))
 
-struct request *blk_do_flush(struct request_queue *q, struct request *rq);
+void blk_insert_flush(struct request *rq);
+void blk_abort_flushes(struct request_queue *q);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
         struct request *rq;
 
         while (1) {
-                while (!list_empty(&q->queue_head)) {
+                if (!list_empty(&q->queue_head)) {
                         rq = list_entry_rq(q->queue_head.next);
-                        if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-                            (rq->cmd_flags & REQ_FLUSH_SEQ))
-                                return rq;
-                        rq = blk_do_flush(q, rq);
-                        if (rq)
-                                return rq;
+                        return rq;
                 }
...
@@ -673,6 +673,11 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
                 q->elevator->ops->elevator_add_req_fn(q, rq);
                 break;
 
+        case ELEVATOR_INSERT_FLUSH:
+                rq->cmd_flags |= REQ_SOFTBARRIER;
+                blk_insert_flush(rq);
+                break;
+
         default:
                 printk(KERN_ERR "%s: bad insertion point %d\n",
                        __func__, where);
@@ -785,6 +790,8 @@ void elv_abort_queue(struct request_queue *q)
 {
         struct request *rq;
 
+        blk_abort_flushes(q);
+
         while (!list_empty(&q->queue_head)) {
                 rq = list_entry_rq(q->queue_head.next);
                 rq->cmd_flags |= REQ_QUIET;
...
@@ -99,13 +99,18 @@ struct request {
         /*
          * The rb_node is only used inside the io scheduler, requests
          * are pruned when moved to the dispatch queue. So let the
-         * completion_data share space with the rb_node.
+         * flush fields share space with the rb_node.
          */
         union {
                 struct rb_node rb_node; /* sort/lookup */
-                void *completion_data;
+                struct {
+                        unsigned int            seq;
+                        struct list_head        list;
+                } flush;
         };
 
+        void *completion_data;
+
         /*
          * Three pointers are available for the IO schedulers, if they need
          * more they have to dynamically allocate it.
@@ -362,11 +367,12 @@ struct request_queue
          * for flush operations
          */
         unsigned int            flush_flags;
-        unsigned int            flush_seq;
-        int                     flush_err;
+        unsigned int            flush_pending_idx:1;
+        unsigned int            flush_running_idx:1;
+        unsigned long           flush_pending_since;
+        struct list_head        flush_queue[2];
+        struct list_head        flush_data_in_flight;
         struct request          flush_rq;
-        struct request          *orig_flush_rq;
-        struct list_head        pending_flushes;
 
         struct mutex            sysfs_lock;
...
@@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_BACK    2
 #define ELEVATOR_INSERT_SORT    3
 #define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_FLUSH   5
 
 /*
  * return values from elevator_may_queue_fn