Merge tag 'io_uring-20190323' of git://git.kernel.dk/linux-block

Pull io_uring fixes and improvements from Jens Axboe: "The first five in this series are heavily inspired by the work Al did on the aio side to fix the races there. The last two re-introduce a feature that was in io_uring before it got merged, but which I pulled since we didn't have a good way to have BVEC iters that already have a stable reference. These aren't necessarily related to block, it's just how io_uring pins fixed buffers" * tag 'io_uring-20190323' of git://git.kernel.dk/linux-block: block: add BIO_NO_PAGE_REF flag iov_iter: add ITER_BVEC_FLAG_NO_REF flag io_uring: mark me as the maintainer io_uring: retry bulk slab allocs as single allocs io_uring: fix poll races io_uring: fix fget/fput handling io_uring: add prepped flag io_uring: make io_read/write return an integer io_uring: use regular request ref counts

Merge tag 'io_uring-20190323' of git://git.kernel.dk/linux-block
Pull io_uring fixes and improvements from Jens Axboe: "The first five in this series are heavily inspired by the work Al did on the aio side to fix the races there. The last two re-introduce a feature that was in io_uring before it got merged, but which I pulled since we didn't have a good way to have BVEC iters that already have a stable reference. These aren't necessarily related to block, it's just how io_uring pins fixed buffers" * tag 'io_uring-20190323' of git://git.kernel.dk/linux-block: block: add BIO_NO_PAGE_REF flag iov_iter: add ITER_BVEC_FLAG_NO_REF flag io_uring: mark me as the maintainer io_uring: retry bulk slab allocs as single allocs io_uring: fix poll races io_uring: fix fget/fput handling io_uring: add prepped flag io_uring: make io_read/write return an integer io_uring: use regular request ref counts
1bdd3dbf · Linus Torvalds · 2335cbe6 · 399254aa · 1bdd3dbf · 1bdd3dbf
Commit 1bdd3dbf authored Mar 23, 2019 by Linus Torvalds
7 changed files
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8096,6 +8096,16 @@ F:	include/linux/iommu.h
 F:	include/linux/of_iommu.h
 F:	include/linux/iova.h
+IO_URING
+M:	Jens Axboe <axboe@kernel.dk>
+L:	linux-block@vger.kernel.org
+L:	linux-fsdevel@vger.kernel.org
+T:	git git://git.kernel.dk/linux-block
+T:	git git://git.kernel.dk/liburing
+S:	Maintained
+F:	fs/io_uring.c
+F:	include/uapi/linux/io_uring.h
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained

--- a/block/bio.c
+++ b/block/bio.c
@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 	size = bio_add_page(bio, bv->bv_page, len,
 				bv->bv_offset + iter->iov_offset);
 	if (size == len) {
-		struct page *page;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
-		int i;
+			struct page *page;
+			int i;
+			mp_bvec_for_each_page(page, bv, i)
+				get_page(page);
+		}
-		/*
-		 * For the normal O_DIRECT case, we could skip grabbing this
-		 * reference and then not have to put them again when IO
-		 * completes. But this breaks some in-kernel users, like
-		 * splicing to/from a loop device, where we release the pipe
-		 * pages unconditionally. If we can fix that case, we can
-		 * get rid of the get here and the need to call
-		 * bio_release_pages() at IO completion time.
-		 */
-		mp_bvec_for_each_page(page, bv, i)
-			get_page(page);
 		iov_iter_advance(iter, size);
 		return 0;
 	}
@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 * This takes either an iterator pointing to user memory, or one pointing to
 * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
 * map them into the kernel. On IO completion, the caller should put those
- * pages. For now, when adding kernel pages, we still grab a reference to the
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
- * page. This isn't strictly needed for the common case, but some call paths
+ * do so, we just have to add the pages to the bio directly. We don't grab an
- * end up releasing pages from eg a pipe and we can't easily control these.
+ * extra reference to those pages (the user should already have that), and we
- * See comment in __bio_iov_bvec_add_pages().
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
 *
 * The function tries, but does not guarantee, to pin as many pages as
 * fit into the bio, or are requested in *iter, whatever is smaller. If
@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
+	/*
+	 * If this is a BVEC iter, then the pages are kernel pages. Don't
+	 * release them on IO completion, if the caller asked us to.
+	 */
+	if (is_bvec && iov_iter_bvec_no_ref(iter))
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
 	do {
 		int ret;
@@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
-	bio_release_pages(bio);
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:

--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
-		int i;
+			struct bvec_iter_all iter_all;
-		struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }

--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,17 +189,28 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
 struct io_poll_iocb {
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
-	bool				woken;
+	bool				done;
 	bool				canceled;
 	struct wait_queue_entry		wait;
 };
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct io_kiocb {
 	union {
+		struct file		*file;
 		struct kiocb		rw;
 		struct io_poll_iocb	poll;
 	};
@@ -214,6 +225,7 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
+#define REQ_F_PREPPED		16	/* prep already done */
 	u64			user_data;
 	u64			error;
@@ -355,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+}
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res, unsigned ev_flags)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_cqring_fill_event(ctx, user_data, res, ev_flags);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	if (waitqueue_active(&ctx->wait))
+	io_cqring_ev_posted(ctx);
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
 }
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -382,13 +399,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 				   struct io_submit_state *state)
 {
+	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 	if (!percpu_ref_tryget(&ctx->refs))
 		return NULL;
 	if (!state) {
-		req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
 			goto out;
 	} else if (!state->free_reqs) {
@@ -396,10 +414,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		int ret;
 		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
+		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
-						state->reqs);
-		if (unlikely(ret <= 0))
+		/*
-			goto out;
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		if (unlikely(ret <= 0)) {
+			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+			if (!state->reqs[0])
+				goto out;
+			ret = 1;
+		}
 		state->free_reqs = ret - 1;
 		state->cur_req = 1;
 		req = state->reqs[0];
@@ -411,7 +437,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	req->ctx = ctx;
 	req->flags = 0;
-	refcount_set(&req->refs, 0);
+	/* one is dropped after submission, the other at completion */
+	refcount_set(&req->refs, 2);
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +456,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 static void io_free_req(struct io_kiocb *req)
 {
-	if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
+	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
-		io_ring_drop_ctx_refs(req->ctx, 1);
+		fput(req->file);
-		kmem_cache_free(req_cachep, req);
+	io_ring_drop_ctx_refs(req->ctx, 1);
-	}
+	kmem_cache_free(req_cachep, req);
+}
+static void io_put_req(struct io_kiocb *req)
+{
+	if (refcount_dec_and_test(&req->refs))
+		io_free_req(req);
 }
 /*
@@ -442,44 +475,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
 	void *reqs[IO_IOPOLL_BATCH];
-	int file_count, to_free;
-	struct file *file = NULL;
 	struct io_kiocb *req;
+	int to_free;
-	file_count = to_free = 0;
+	to_free = 0;
 	while (!list_empty(done)) {
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-		reqs[to_free++] = req;
 		(*nr_events)++;
-		/*
+		if (refcount_dec_and_test(&req->refs)) {
-		 * Batched puts of the same file, to avoid dirtying the
+			/* If we're not using fixed files, we have to pair the
-		 * file usage count multiple times, if avoidable.
+			 * completion part with the file put. Use regular
-		 */
+			 * completions for those, only batch free for fixed
-		if (!(req->flags & REQ_F_FIXED_FILE)) {
+			 * file.
-			if (!file) {
+			 */
-				file = req->rw.ki_filp;
+			if (req->flags & REQ_F_FIXED_FILE) {
-				file_count = 1;
+				reqs[to_free++] = req;
-			} else if (file == req->rw.ki_filp) {
+				if (to_free == ARRAY_SIZE(reqs))
-				file_count++;
+					io_free_req_many(ctx, reqs, &to_free);
 			} else {
-				fput_many(file, file_count);
+				io_free_req(req);
-				file = req->rw.ki_filp;
-				file_count = 1;
 			}
 		}
-		if (to_free == ARRAY_SIZE(reqs))
-			io_free_req_many(ctx, reqs, &to_free);
 	}
-	io_commit_cqring(ctx);
-	if (file)
+	io_commit_cqring(ctx);
-		fput_many(file, file_count);
 	io_free_req_many(ctx, reqs, &to_free);
 }
@@ -602,21 +625,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
-static void io_fput(struct io_kiocb *req)
-{
-	if (!(req->flags & REQ_F_FIXED_FILE))
-		fput(req->rw.ki_filp);
-}
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 	kiocb_end_write(kiocb);
-	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
-	io_free_req(req);
+	io_put_req(req);
 }
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -731,31 +747,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio, flags;
+	unsigned ioprio;
-	int fd, ret;
+	int ret;
+	if (!req->file)
+		return -EBADF;
 	/* For -EAGAIN retry, everything is already prepped */
-	if (kiocb->ki_filp)
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
-	flags = READ_ONCE(sqe->flags);
+	if (force_nonblock && !io_file_supports_async(req->file))
-	fd = READ_ONCE(sqe->fd);
+		force_nonblock = false;
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files ||
-		    (unsigned) fd >= ctx->nr_user_files))
-			return -EBADF;
-		kiocb->ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		if (s->needs_fixed_file)
-			return -EBADF;
-		kiocb->ki_filp = io_file_get(state, fd);
-		if (unlikely(!kiocb->ki_filp))
-			return -EBADF;
-		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-			force_nonblock = false;
-	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -764,7 +767,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
 		if (ret)
-			goto out_fput;
+			return ret;
 		kiocb->ki_ioprio = ioprio;
 	} else
@@ -772,38 +775,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
-		goto out_fput;
+		return ret;
 	if (force_nonblock) {
 		kiocb->ki_flags |= IOCB_NOWAIT;
 		req->flags |= REQ_F_FORCE_NONBLOCK;
 	}
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		ret = -EOPNOTSUPP;
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
-			goto out_fput;
+			return -EOPNOTSUPP;
 		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
-		if (kiocb->ki_flags & IOCB_HIPRI) {
+		if (kiocb->ki_flags & IOCB_HIPRI)
-			ret = -EINVAL;
+			return -EINVAL;
-			goto out_fput;
-		}
 		kiocb->ki_complete = io_complete_rw;
 	}
+	req->flags |= REQ_F_PREPPED;
 	return 0;
-out_fput:
-	if (!(flags & IOSQE_FIXED_FILE)) {
-		/*
-		 * in case of error, we didn't use this file reference. drop it.
-		 */
-		if (state)
-			state->used_refs--;
-		io_file_put(state, kiocb->ki_filp);
-	}
-	return ret;
 }
 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -864,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
+	/* don't drop a reference to these pages */
+	iter->type |= ITER_BVEC_FLAG_NO_REF;
 	return 0;
 }
@@ -887,7 +881,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		int ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -945,31 +939,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	async_list->io_end = io_end;
 }
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		       bool force_nonblock, struct io_submit_state *state)
+		   bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		goto out_fput;
+		return -EBADF;
-	ret = -EINVAL;
 	if (unlikely(!file->f_op->read_iter))
-		goto out_fput;
+		return -EINVAL;
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -991,38 +983,32 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		}
 	}
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
-			bool force_nonblock, struct io_submit_state *state)
+		    bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
-	ret = -EBADF;
 	file = kiocb->ki_filp;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out_fput;
+		return -EBADF;
-	ret = -EINVAL;
 	if (unlikely(!file->f_op->write_iter))
-		goto out_fput;
+		return -EINVAL;
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 	iov_count = iov_iter_count(&iter);
@@ -1054,10 +1040,6 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	}
 out_free:
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
@@ -1072,29 +1054,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	/*
-	 * Twilight zone - it's possible that someone issued an opcode that
-	 * has a file attached, then got -EAGAIN on submission, and changed
-	 * the sqe before we retried it from async context. Avoid dropping
-	 * a file reference for this malicious case, and flag the error.
-	 */
-	if (req->rw.ki_filp) {
-		err = -EBADF;
-		io_fput(req);
-	}
 	io_cqring_add_event(ctx, user_data, err, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned flags;
-	int fd;
-	/* Prep already done */
+	if (!req->file)
-	if (req->rw.ki_filp)
+		return -EBADF;
+	/* Prep already done (EAGAIN retry) */
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1102,20 +1074,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
-	fd = READ_ONCE(sqe->fd);
+	req->flags |= REQ_F_PREPPED;
-	flags = READ_ONCE(sqe->flags);
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		req->rw.ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		req->rw.ki_filp = fget(fd);
-		if (unlikely(!req->rw.ki_filp))
-			return -EBADF;
-	}
 	return 0;
 }
@@ -1144,9 +1103,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
-	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
@@ -1204,15 +1162,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     __poll_t mask)
 {
-	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+	req->poll.done = true;
-	io_fput(req);
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
-	io_free_req(req);
+	io_commit_cqring(ctx);
 }
 static void io_poll_complete_work(struct work_struct *work)
@@ -1240,9 +1199,11 @@ static void io_poll_complete_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&req->list);
+	io_poll_complete(ctx, req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
-	io_poll_complete(req, mask);
+	io_cqring_ev_posted(ctx);
+	io_put_req(req);
 }
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1253,29 +1214,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
+	unsigned long flags;
-	poll->woken = true;
 	/* for instances that support it check for an event match first: */
-	if (mask) {
+	if (mask && !(mask & poll->events))
-		unsigned long flags;
+		return 0;
-		if (!(mask & poll->events))
+	list_del_init(&poll->wait.entry);
-			return 0;
-		/* try to complete the iocb inline if we can: */
+	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+		list_del(&req->list);
-			list_del(&req->list);
+		io_poll_complete(ctx, req, mask);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-			list_del_init(&poll->wait.entry);
+		io_cqring_ev_posted(ctx);
-			io_poll_complete(req, mask);
+		io_put_req(req);
-			return 1;
+	} else {
-		}
+		queue_work(ctx->sqo_wq, &req->work);
 	}
-	list_del_init(&poll->wait.entry);
-	queue_work(ctx->sqo_wq, &req->work);
 	return 1;
 }
@@ -1305,36 +1262,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
-	unsigned flags;
+	bool cancel = false;
 	__poll_t mask;
 	u16 events;
-	int fd;
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
 		return -EINVAL;
+	if (!poll->file)
+		return -EBADF;
 	INIT_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		poll->file = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		poll->file = fget(fd);
-	}
-	if (unlikely(!poll->file))
-		return -EBADF;
 	poll->head = NULL;
-	poll->woken = false;
+	poll->done = false;
 	poll->canceled = false;
 	ipt.pt._qproc = io_poll_queue_proc;
@@ -1346,56 +1290,44 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
-	/* one for removal from waitqueue, one for this function */
-	refcount_set(&req->refs, 2);
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-	if (unlikely(!poll->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
 	spin_lock_irq(&ctx->completion_lock);
-	spin_lock(&poll->head->lock);
+	if (likely(poll->head)) {
-	if (poll->woken) {
+		spin_lock(&poll->head->lock);
-		/* wake_up context handles the rest */
+		if (unlikely(list_empty(&poll->wait.entry))) {
-		mask = 0;
+			if (ipt.error)
+				cancel = true;
+			ipt.error = 0;
+			mask = 0;
+		}
+		if (mask || ipt.error)
+			list_del_init(&poll->wait.entry);
+		else if (cancel)
+			WRITE_ONCE(poll->canceled, true);
+		else if (!poll->done) /* actually waiting for an event */
+			list_add_tail(&req->list, &ctx->cancel_list);
+		spin_unlock(&poll->head->lock);
+	}
+	if (mask) { /* no async, we'd stolen it */
+		req->error = mangle_poll(mask);
 		ipt.error = 0;
-	} else if (mask || ipt.error) {
+		io_poll_complete(ctx, req, mask);
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&poll->wait.entry));
-		list_del_init(&poll->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&req->list, &ctx->cancel_list);
 	}
-	spin_unlock(&poll->head->lock);
 	spin_unlock_irq(&ctx->completion_lock);
-out:
+	if (mask) {
-	if (unlikely(ipt.error)) {
+		io_cqring_ev_posted(ctx);
-		if (!(flags & IOSQE_FIXED_FILE))
+		io_put_req(req);
-			fput(poll->file);
-		/*
-		 * Drop one of our refs to this req, __io_submit_sqe() will
-		 * drop the other one since we're returning an error.
-		 */
-		io_free_req(req);
-		return ipt.error;
 	}
+	return ipt.error;
-	if (mask)
-		io_poll_complete(req, mask);
-	io_free_req(req);
-	return 0;
 }
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
 {
-	ssize_t ret;
+	int ret, opcode;
-	int opcode;
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
@@ -1524,10 +1456,13 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 					break;
 				cond_resched();
 			} while (1);
+			/* drop submission reference */
+			io_put_req(req);
 		}
 		if (ret) {
 			io_cqring_add_event(ctx, sqe->user_data, ret, 0);
-			io_free_req(req);
+			io_put_req(req);
 		}
 		/* async context always use a copy of the sqe */
@@ -1614,11 +1549,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	return ret;
 }
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+	int op = READ_ONCE(sqe->opcode);
+	switch (op) {
+	case IORING_OP_NOP:
+	case IORING_OP_POLL_REMOVE:
+		return false;
+	default:
+		return true;
+	}
+}
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+			   struct io_submit_state *state, struct io_kiocb *req)
+{
+	unsigned flags;
+	int fd;
+	flags = READ_ONCE(s->sqe->flags);
+	fd = READ_ONCE(s->sqe->fd);
+	if (!io_op_needs_file(s->sqe)) {
+		req->file = NULL;
+		return 0;
+	}
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
+		req->file = io_file_get(state, fd);
+		if (unlikely(!req->file))
+			return -EBADF;
+	}
+	return 0;
+}
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 			 struct io_submit_state *state)
 {
 	struct io_kiocb *req;
-	ssize_t ret;
+	int ret;
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
@@ -1628,7 +1607,9 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(!req))
 		return -EAGAIN;
-	req->rw.ki_filp = NULL;
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret))
+		goto out;
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
@@ -1649,11 +1630,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 				INIT_WORK(&req->work, io_sq_wq_submit_work);
 				queue_work(ctx->sqo_wq, &req->work);
 			}
-			ret = 0;
+			/*
+			 * Queued up for async execution, worker will release
+			 * submit reference when the iocb is actually
+			 * submitted.
+			 */
+			return 0;
 		}
 	}
+out:
+	/* drop submission reference */
+	io_put_req(req);
+	/* and drop final reference, if we failed */
 	if (ret)
-		io_free_req(req);
+		io_put_req(req);
 	return ret;
 }

--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
-		int i;
+			struct bvec_iter_all iter_all;
-		struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }

--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@ struct bio {
 /*
 * bio flags
 */
+#define BIO_NO_PAGE_REF	0	/* don't put release vec pages */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */

--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -23,14 +23,23 @@ struct kvec {
 };
 enum iter_type {
-	ITER_IOVEC = 0,
+	/* set if ITER_BVEC doesn't hold a bv_page ref */
-	ITER_KVEC = 2,
+	ITER_BVEC_FLAG_NO_REF = 2,
-	ITER_BVEC = 4,
-	ITER_PIPE = 8,
+	/* iter types */
-	ITER_DISCARD = 16,
+	ITER_IOVEC = 4,
+	ITER_KVEC = 8,
+	ITER_BVEC = 16,
+	ITER_PIPE = 32,
+	ITER_DISCARD = 64,
 };
 struct iov_iter {
+	/*
+	 * Bit 0 is the read/write bit, set if we're writing.
+	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+	 * the caller isn't expecting to drop a page reference when done.
+	 */
 	unsigned int type;
 	size_t iov_offset;
 	size_t count;
@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
+static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
+{
+	return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
+}
 /*
 * Total number of bytes covered by an iovec.
 *