Commit 0d21e668 authored by Linus Torvalds

Merge tag 'aio-poll-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull aio poll fixes from Eric Biggers:
 "Fix three bugs in aio poll, and one issue with POLLFREE more broadly:

   - aio poll didn't handle POLLFREE, causing a use-after-free.

   - aio poll could block while the file is ready.

   - aio poll called eventfd_signal() when it isn't allowed.

   - POLLFREE didn't handle multiple exclusive waiters correctly.

  This has been tested with the libaio test suite, as well as with test
  programs I wrote that reproduce the first two bugs. I am sending this
  pull request myself as no one seems to be maintaining this code"

* tag 'aio-poll-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  aio: Fix incorrect usage of eventfd_signal_allowed()
  aio: fix use-after-free due to missing POLLFREE handling
  aio: keep poll requests on waitqueue until completed
  signalfd: use wake_up_pollfree()
  binder: use wake_up_pollfree()
  wait: add wake_up_pollfree()
parents b9172f9e 4b374986
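
For context: the sketch below shows how userspace drives the IOCB_CMD_POLL path that this series fixes. It is purely illustrative and is not one of the test programs mentioned in the message above. It uses the raw io_setup()/io_submit()/io_getevents() syscalls (glibc does not wrap them) to poll stdin for readability, passing the poll event mask in aio_buf as fs/aio.c expects.

#define _GNU_SOURCE
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static long io_setup(unsigned int nr, aio_context_t *ctx)
{
        return syscall(SYS_io_setup, nr, ctx);
}

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbs)
{
        return syscall(SYS_io_submit, ctx, nr, iocbs);
}

static long io_getevents(aio_context_t ctx, long min_nr, long nr,
                         struct io_event *events, struct timespec *timeout)
{
        return syscall(SYS_io_getevents, ctx, min_nr, nr, events, timeout);
}

int main(void)
{
        aio_context_t ctx = 0;
        struct iocb cb;
        struct iocb *cbs[1] = { &cb };
        struct io_event ev;

        if (io_setup(1, &ctx) < 0) {
                perror("io_setup");
                return 1;
        }

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = STDIN_FILENO;
        cb.aio_lio_opcode = IOCB_CMD_POLL;      /* handled by aio_poll() in fs/aio.c */
        cb.aio_buf = POLLIN;                    /* events of interest */

        if (io_submit(ctx, 1, cbs) != 1) {
                perror("io_submit");
                return 1;
        }

        /* Blocks until stdin becomes readable; ev.res carries the poll mask. */
        if (io_getevents(ctx, 1, 1, &ev, NULL) == 1)
                printf("poll completed, mask 0x%llx\n", (unsigned long long)ev.res);

        return 0;
}
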
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -4422,23 +4422,20 @@ static int binder_thread_release(struct binder_proc *proc,
 	__release(&t->lock);
 
 	/*
-	 * If this thread used poll, make sure we remove the waitqueue
-	 * from any epoll data structures holding it with POLLFREE.
-	 * waitqueue_active() is safe to use here because we're holding
-	 * the inner lock.
+	 * If this thread used poll, make sure we remove the waitqueue from any
+	 * poll data structures holding it.
 	 */
-	if ((thread->looper & BINDER_LOOPER_STATE_POLL) &&
-	    waitqueue_active(&thread->wait)) {
-		wake_up_poll(&thread->wait, EPOLLHUP | POLLFREE);
-	}
+	if (thread->looper & BINDER_LOOPER_STATE_POLL)
+		wake_up_pollfree(&thread->wait);
 
 	binder_inner_proc_unlock(thread->proc);
 
 	/*
-	 * This is needed to avoid races between wake_up_poll() above and
-	 * and ep_remove_waitqueue() called for other reasons (eg the epoll file
-	 * descriptor being closed); ep_remove_waitqueue() holds an RCU read
-	 * lock, so we can be sure it's done after calling synchronize_rcu().
+	 * This is needed to avoid races between wake_up_pollfree() above and
+	 * someone else removing the last entry from the queue for other reasons
+	 * (e.g. ep_remove_wait_queue() being called due to an epoll file
+	 * descriptor being closed).  Such other users hold an RCU read lock, so
+	 * we can be sure they're done after we call synchronize_rcu().
 	 */
 	if (thread->looper & BINDER_LOOPER_STATE_POLL)
 		synchronize_rcu();
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -181,8 +181,9 @@ struct poll_iocb {
 	struct file		*file;
 	struct wait_queue_head	*head;
 	__poll_t		events;
-	bool			done;
 	bool			cancelled;
+	bool			work_scheduled;
+	bool			work_need_resched;
 	struct wait_queue_entry	wait;
 	struct work_struct	work;
 };
@@ -1619,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
 	iocb_put(iocb);
 }
 
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken.  Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+	wait_queue_head_t *head;
+
+	/*
+	 * While we hold the waitqueue lock and the waitqueue is nonempty,
+	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
+	 * lock in the first place can race with the waitqueue being freed.
+	 *
+	 * We solve this as eventpoll does: by taking advantage of the fact that
+	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
+	 * we enter rcu_read_lock() and see that the pointer to the queue is
+	 * non-NULL, we can then lock it without the memory being freed out from
+	 * under us, then check whether the request is still on the queue.
+	 *
+	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+	 * case the caller deletes the entry from the queue, leaving it empty.
+	 * In that case, only RCU prevents the queue memory from being freed.
+	 */
+	rcu_read_lock();
+	head = smp_load_acquire(&req->head);
+	if (head) {
+		spin_lock(&head->lock);
+		if (!list_empty(&req->wait.entry))
+			return true;
+		spin_unlock(&head->lock);
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+	spin_unlock(&req->head->lock);
+	rcu_read_unlock();
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1638,14 +1684,27 @@ static void aio_poll_complete_work(struct work_struct *work)
 	 * avoid further branches in the fast path.
 	 */
 	spin_lock_irq(&ctx->ctx_lock);
-	if (!mask && !READ_ONCE(req->cancelled)) {
-		add_wait_queue(req->head, &req->wait);
-		spin_unlock_irq(&ctx->ctx_lock);
-		return;
-	}
+	if (poll_iocb_lock_wq(req)) {
+		if (!mask && !READ_ONCE(req->cancelled)) {
+			/*
+			 * The request isn't actually ready to be completed yet.
+			 * Reschedule completion if another wakeup came in.
+			 */
+			if (req->work_need_resched) {
+				schedule_work(&req->work);
+				req->work_need_resched = false;
+			} else {
+				req->work_scheduled = false;
+			}
+			poll_iocb_unlock_wq(req);
+			spin_unlock_irq(&ctx->ctx_lock);
+			return;
+		}
+		list_del_init(&req->wait.entry);
+		poll_iocb_unlock_wq(req);
+	} /* else, POLLFREE has freed the waitqueue, so we must complete */
 	list_del_init(&iocb->ki_list);
 	iocb->ki_res.res = mangle_poll(mask);
-	req->done = true;
 	spin_unlock_irq(&ctx->ctx_lock);
 
 	iocb_put(iocb);
@@ -1657,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
 	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
 	struct poll_iocb *req = &aiocb->poll;
 
-	spin_lock(&req->head->lock);
-	WRITE_ONCE(req->cancelled, true);
-	if (!list_empty(&req->wait.entry)) {
-		list_del_init(&req->wait.entry);
-		schedule_work(&aiocb->poll.work);
-	}
-	spin_unlock(&req->head->lock);
+	if (poll_iocb_lock_wq(req)) {
+		WRITE_ONCE(req->cancelled, true);
+		if (!req->work_scheduled) {
+			schedule_work(&aiocb->poll.work);
+			req->work_scheduled = true;
+		}
+		poll_iocb_unlock_wq(req);
+	} /* else, the request was force-cancelled by POLLFREE already */
 
 	return 0;
 }
@@ -1680,21 +1740,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	if (mask && !(mask & req->events))
 		return 0;
 
-	list_del_init(&req->wait.entry);
-
-	if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+	/*
+	 * Complete the request inline if possible.  This requires that three
+	 * conditions be met:
+	 *   1. An event mask must have been passed.  If a plain wakeup was done
+	 *      instead, then mask == 0 and we have to call vfs_poll() to get
+	 *      the events, so inline completion isn't possible.
+	 *   2. The completion work must not have already been scheduled.
+	 *   3. ctx_lock must not be busy.  We have to use trylock because we
+	 *      already hold the waitqueue lock, so this inverts the normal
+	 *      locking order.  Use irqsave/irqrestore because not all
+	 *      filesystems (e.g. fuse) call this function with IRQs disabled,
+	 *      yet IRQs have to be disabled before ctx_lock is obtained.
+	 */
+	if (mask && !req->work_scheduled &&
+	    spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
 		struct kioctx *ctx = iocb->ki_ctx;
 
-		/*
-		 * Try to complete the iocb inline if we can. Use
-		 * irqsave/irqrestore because not all filesystems (e.g. fuse)
-		 * call this function with IRQs disabled and because IRQs
-		 * have to be disabled before ctx_lock is obtained.
-		 */
+		list_del_init(&req->wait.entry);
 		list_del(&iocb->ki_list);
 		iocb->ki_res.res = mangle_poll(mask);
-		req->done = true;
-		if (iocb->ki_eventfd && eventfd_signal_allowed()) {
+		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
 			iocb = NULL;
 			INIT_WORK(&req->work, aio_poll_put_work);
 			schedule_work(&req->work);
@@ -1703,7 +1769,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		if (iocb)
 			iocb_put(iocb);
 	} else {
-		schedule_work(&req->work);
+		/*
+		 * Schedule the completion work if needed.  If it was already
+		 * scheduled, record that another wakeup came in.
+		 *
+		 * Don't remove the request from the waitqueue here, as it might
+		 * not actually be complete yet (we won't know until vfs_poll()
+		 * is called), and we must not miss any wakeups.  POLLFREE is an
+		 * exception to this; see below.
+		 */
+		if (req->work_scheduled) {
+			req->work_need_resched = true;
+		} else {
+			schedule_work(&req->work);
+			req->work_scheduled = true;
+		}
+
+		/*
+		 * If the waitqueue is being freed early but we can't complete
+		 * the request inline, we have to tear down the request as best
+		 * we can.  That means immediately removing the request from its
+		 * waitqueue and preventing all further accesses to the
+		 * waitqueue via the request.  We also need to schedule the
+		 * completion work (done above).  Also mark the request as
+		 * cancelled, to potentially skip an unneeded call to ->poll().
+		 */
+		if (mask & POLLFREE) {
+			WRITE_ONCE(req->cancelled, true);
+			list_del_init(&req->wait.entry);
+
+			/*
+			 * Careful: this *must* be the last step, since as soon
+			 * as req->head is NULL'ed out, the request can be
+			 * completed and freed, since aio_poll_complete_work()
+			 * will no longer need to take the waitqueue lock.
+			 */
+			smp_store_release(&req->head, NULL);
+		}
 	}
 	return 1;
 }
@@ -1711,6 +1813,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 struct aio_poll_table {
 	struct poll_table_struct	pt;
 	struct aio_kiocb		*iocb;
+	bool				queued;
 	int				error;
 };
@@ -1721,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
 
 	/* multiple wait queues per file are not supported */
-	if (unlikely(pt->iocb->poll.head)) {
+	if (unlikely(pt->queued)) {
 		pt->error = -EINVAL;
 		return;
 	}
 
+	pt->queued = true;
 	pt->error = 0;
 	pt->iocb->poll.head = head;
 	add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1750,12 +1854,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
 
 	req->head = NULL;
-	req->done = false;
 	req->cancelled = false;
+	req->work_scheduled = false;
+	req->work_need_resched = false;
 
 	apt.pt._qproc = aio_poll_queue_proc;
 	apt.pt._key = req->events;
 	apt.iocb = aiocb;
+	apt.queued = false;
 	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
 
 	/* initialized the list so that we can do list_empty checks */
@@ -1764,23 +1870,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
 	spin_lock_irq(&ctx->ctx_lock);
-	if (likely(req->head)) {
-		spin_lock(&req->head->lock);
-		if (unlikely(list_empty(&req->wait.entry))) {
-			if (apt.error)
+	if (likely(apt.queued)) {
+		bool on_queue = poll_iocb_lock_wq(req);
+
+		if (!on_queue || req->work_scheduled) {
+			/*
+			 * aio_poll_wake() already either scheduled the async
+			 * completion work, or completed the request inline.
+			 */
+			if (apt.error) /* unsupported case: multiple queues */
 				cancel = true;
 			apt.error = 0;
 			mask = 0;
 		}
 		if (mask || apt.error) {
+			/* Steal to complete synchronously. */
 			list_del_init(&req->wait.entry);
 		} else if (cancel) {
+			/* Cancel if possible (may be too late though). */
 			WRITE_ONCE(req->cancelled, true);
-		} else if (!req->done) { /* actually waiting for an event */
+		} else if (on_queue) {
+			/*
+			 * Actually waiting for an event, so add the request to
+			 * active_reqs so that it can be cancelled if needed.
+			 */
 			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
 			aiocb->ki_cancel = aio_poll_cancel;
 		}
-		spin_unlock(&req->head->lock);
+		if (on_queue)
+			poll_iocb_unlock_wq(req);
 	}
 	if (mask) { /* no async, we'd stolen it */
 		aiocb->ki_res.res = mangle_poll(mask);
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -35,17 +35,7 @@
 void signalfd_cleanup(struct sighand_struct *sighand)
 {
-	wait_queue_head_t *wqh = &sighand->signalfd_wqh;
-	/*
-	 * The lockless check can race with remove_wait_queue() in progress,
-	 * but in this case its caller should run under rcu_read_lock() and
-	 * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
-	 */
-	if (likely(!waitqueue_active(wqh)))
-		return;
-
-	/* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
-	wake_up_poll(wqh, EPOLLHUP | POLLFREE);
+	wake_up_pollfree(&sighand->signalfd_wqh);
 }
 
 struct signalfd_ctx {
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
+void __wake_up_pollfree(struct wait_queue_head *wq_head);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
 #define wake_up_interruptible_sync_poll_locked(x, m)				\
 	__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
 
+/**
+ * wake_up_pollfree - signal that a polled waitqueue is going away
+ * @wq_head: the wait queue head
+ *
+ * In the very rare cases where a ->poll() implementation uses a waitqueue whose
+ * lifetime is tied to a task rather than to the 'struct file' being polled,
+ * this function must be called before the waitqueue is freed so that
+ * non-blocking polls (e.g. epoll) are notified that the queue is going away.
+ *
+ * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
+ * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
+ */
+static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+	/*
+	 * For performance reasons, we don't always take the queue lock here.
+	 * Therefore, we might race with someone removing the last entry from
+	 * the queue, and proceed while they still hold the queue lock.
+	 * However, rcu_read_lock() is required to be held in such cases, so we
+	 * can safely proceed with an RCU-delayed free.
+	 */
+	if (waitqueue_active(wq_head))
+		__wake_up_pollfree(wq_head);
+}
+
 #define ___wait_cond_timeout(condition)						\
 ({										\
 	bool __cond = (condition);						\
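
The kernel-doc above spells out the contract for the new helper; the minimal sketch below shows what a caller is expected to do with it. It is not taken from the series: a hypothetical driver whose ->poll() hands out a waitqueue embedded in an object with its own lifetime calls wake_up_pollfree() before tearing the object down, then RCU-delays the free. The names my_obj and my_obj_destroy() are made up for illustration.

/* Illustrative only; my_obj and my_obj_destroy() are hypothetical. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/wait.h>

struct my_obj {
        wait_queue_head_t wait;         /* handed out by ->poll() via poll_wait() */
        struct rcu_head rcu;
};

static void my_obj_destroy(struct my_obj *obj)
{
        /* Force every poller (epoll, aio poll, ...) off the queue first. */
        wake_up_pollfree(&obj->wait);

        /* Then RCU-delay the free, as the kernel-doc above requires. */
        kfree_rcu(obj, rcu);
}

Using synchronize_rcu() before a plain kfree(), or allocating from a SLAB_TYPESAFE_BY_RCU cache, would satisfy the same requirement; this series uses synchronize_rcu() in binder and relies on SLAB_TYPESAFE_BY_RCU for signalfd's sighand_struct.
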
--- a/include/uapi/asm-generic/poll.h
+++ b/include/uapi/asm-generic/poll.h
@@ -29,7 +29,7 @@
 #define POLLRDHUP	0x2000
 #endif
 
-#define POLLFREE	(__force __poll_t)0x4000	/* currently only for epoll */
+#define POLLFREE	(__force __poll_t)0x4000
 
 #define POLL_BUSY_LOOP	(__force __poll_t)0x8000
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+	__wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+	/* POLLFREE must have cleared the queue. */
+	WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
 /*
  * Note: we use "set_current_state()" _after_ the wait-queue add,
  * because we need a memory barrier there on SMP, so that any