Commit cbcfa130 authored by Eric Biggers, committed by Linus Torvalds

fs/userfaultfd.c: disable irqs for fault_pending and event locks

When IOCB_CMD_POLL is used on a userfaultfd, aio_poll() disables IRQs
and takes kioctx::ctx_lock, then userfaultfd_ctx::fd_wqh.lock.

This may have to wait for userfaultfd_ctx::fd_wqh.lock to be released by
userfaultfd_ctx_read(), which in turn can be waiting for
userfaultfd_ctx::fault_pending_wqh.lock or
userfaultfd_ctx::event_wqh.lock.

But elsewhere the fault_pending_wqh and event_wqh locks are taken with
IRQs enabled.  Since the IRQ handler may take kioctx::ctx_lock, lockdep
reports that a deadlock is possible.
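
Roughly, the reported wait chain looks like this (an illustrative sketch drawn from the description above, not lockdep output; the exact tasks and interleaving are hypothetical):

	/*
	 * aio_poll()                    IRQs off, holds kioctx::ctx_lock,
	 *                               spins on userfaultfd_ctx::fd_wqh.lock
	 * userfaultfd_ctx_read()        holds fd_wqh.lock,
	 *                               spins on fault_pending_wqh.lock
	 * e.g. handle_userfault()       holds fault_pending_wqh.lock with IRQs on;
	 *                               an IRQ on this CPU can take kioctx::ctx_lock
	 *                               and spin on it, so the chain never unwinds
	 */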

Fix it by always disabling IRQs when taking the fault_pending_wqh and
event_wqh locks.
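
For instance, the wakeup path in userfaultfd_release() (changed in the diff below) becomes, as a minimal sketch of the pattern rather than a standalone snippet:

	spin_lock_irq(&ctx->fault_pending_wqh.lock);	/* was spin_lock() */
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);	/* was spin_unlock() */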

Commit ae62c16e ("userfaultfd: disable irqs when taking the
waitqueue lock") didn't fix this because it only accounted for the
fd_wqh lock, not the other locks nested inside it.

Link: http://lkml.kernel.org/r/20190627075004.21259-1-ebiggers@kernel.org
Fixes: bfe4037e ("aio: implement IOCB_CMD_POLL")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reported-by: syzbot+fab6de82892b6b9c6191@syzkaller.appspotmail.com
Reported-by: syzbot+53c0b767f7ca0dc0c451@syzkaller.appspotmail.com
Reported-by: syzbot+a3accb352f9c22041cfa@syzkaller.appspotmail.com
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>	[4.19+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent b9705d87
fs/userfaultfd.c
@@ -40,6 +40,16 @@ enum userfaultfd_state {
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
  * to be in the same cacheline.
+ *
+ * Locking order:
+ *	fd_wqh.lock
+ *		fault_pending_wqh.lock
+ *			fault_wqh.lock
+ *				event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
  */
 struct userfaultfd_ctx {
 	/* waitqueue head for the pending (i.e. not read) userfaults */
@@ -458,7 +468,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
 			 TASK_KILLABLE;

-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -470,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * __add_wait_queue.
 	 */
 	set_current_state(blocking_state);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

 	if (!is_vm_hugetlb_page(vmf->vma))
 		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -552,13 +562,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * kernel stack can be released after the list_del_init.
 	 */
 	if (!list_empty_careful(&uwq.wq.entry)) {
-		spin_lock(&ctx->fault_pending_wqh.lock);
+		spin_lock_irq(&ctx->fault_pending_wqh.lock);
 		/*
 		 * No need of list_del_init(), the uwq on the stack
 		 * will be freed shortly anyway.
 		 */
 		list_del(&uwq.wq.entry);
-		spin_unlock(&ctx->fault_pending_wqh.lock);
+		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 	}

 	/*
@@ -583,7 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	init_waitqueue_entry(&ewq->wq, current);
 	release_new_ctx = NULL;

-	spin_lock(&ctx->event_wqh.lock);
+	spin_lock_irq(&ctx->event_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -613,15 +623,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 			break;
 		}

-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);

 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();

-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->event_wqh.lock);
+	spin_unlock_irq(&ctx->event_wqh.lock);

 	if (release_new_ctx) {
 		struct vm_area_struct *vma;
@@ -918,10 +928,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	 * the last page faults that may have been already waiting on
 	 * the fault_*wqh.
 	 */
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

 	/* Flush pending events that may still wait on event_wqh */
 	wake_up_all(&ctx->event_wqh);
@@ -1134,7 +1144,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,

 	if (!ret && msg->event == UFFD_EVENT_FORK) {
 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 		if (!list_empty(&fork_event)) {
 			/*
 			 * The fork thread didn't abort, so we can
@@ -1180,7 +1190,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			if (ret)
 				userfaultfd_ctx_put(fork_nctx);
 		}
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 	}

 	return ret;
@@ -1219,14 +1229,14 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
 static void __wake_userfault(struct userfaultfd_ctx *ctx,
 			     struct userfaultfd_wake_range *range)
 {
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/* wake all in the range and autoremove */
 	if (waitqueue_active(&ctx->fault_pending_wqh))
 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
 				     range);
 	if (waitqueue_active(&ctx->fault_wqh))
 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 }

 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
@@ -1881,7 +1891,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	wait_queue_entry_t *wq;
 	unsigned long pending = 0, total = 0;

-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
 		pending++;
 		total++;
@@ -1889,7 +1899,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
 		total++;
 	}
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

 	/*
 	 * If more protocols will be added, there will be all shown