Commit 58552705 authored by Davide Libenzi's avatar Davide Libenzi Committed by Ben Collins

[PATCH] epoll race fix

There was a race triggered by heavy MT usage that could have left
processes in D state. Bad Davide, bad ...

Also, the semaphore is now per-epoll-fd and not global. Plus some comment
adjustment.
parent 4a07c099
...@@ -150,6 +150,14 @@ struct eventpoll { ...@@ -150,6 +150,14 @@ struct eventpoll {
/* Protect the this structure access */ /* Protect the this structure access */
rwlock_t lock; rwlock_t lock;
/*
* This semaphore is used to ensure that files are not removed
* while epoll is using them. This is read-held during the event
* collection loop and it is write-held during the file cleanup
* path, the epoll file exit code and the ctl operations.
*/
struct rw_semaphore sem;
/* Wait queue used by sys_epoll_wait() */ /* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq; wait_queue_head_t wq;
...@@ -279,16 +287,6 @@ static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, ...@@ -279,16 +287,6 @@ static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
/* Safe wake up implementation */ /* Safe wake up implementation */
static struct poll_safewake psw; static struct poll_safewake psw;
/*
* This semaphore is used to ensure that files are not removed
* while epoll is using them. Namely the f_op->poll(), since
* it has to be called from outside the lock, must be protected.
* This is read-held during the event transfer loop to userspace
* and it is write-held during the file cleanup path and the epoll
* file exit code.
*/
static struct rw_semaphore epsem;
/* Slab cache used to allocate "struct epitem" */ /* Slab cache used to allocate "struct epitem" */
static kmem_cache_t *epi_cache; static kmem_cache_t *epi_cache;
...@@ -357,15 +355,14 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) ...@@ -357,15 +355,14 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
list_for_each(lnk, lsthead) { list_for_each(lnk, lsthead) {
tncur = list_entry(lnk, struct wake_task_node, llink); tncur = list_entry(lnk, struct wake_task_node, llink);
if (tncur->task == this_task) { if (tncur->wq == wq ||
if (tncur->wq == wq || ++wake_nests > EP_MAX_POLLWAKE_NESTS) { (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
/* /*
* Ops ... loop detected or maximum nest level reached. * Ops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself. * We abort this wake by breaking the cycle itself.
*/ */
spin_unlock_irqrestore(&psw->lock, flags); spin_unlock_irqrestore(&psw->lock, flags);
return; return;
}
} }
} }
...@@ -437,14 +434,15 @@ void eventpoll_release(struct file *file) ...@@ -437,14 +434,15 @@ void eventpoll_release(struct file *file)
* The only hit might come from ep_free() but by holding the semaphore * The only hit might come from ep_free() but by holding the semaphore
* will correctly serialize the operation. * will correctly serialize the operation.
*/ */
down_write(&epsem);
while (!list_empty(lsthead)) { while (!list_empty(lsthead)) {
epi = list_entry(lsthead->next, struct epitem, fllink); epi = list_entry(lsthead->next, struct epitem, fllink);
EP_LIST_DEL(&epi->fllink); EP_LIST_DEL(&epi->fllink);
down_write(&epi->ep->sem);
ep_remove(epi->ep, epi); ep_remove(epi->ep, epi);
up_write(&epi->ep->sem);
} }
up_write(&epsem);
} }
...@@ -568,9 +566,18 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event *even ...@@ -568,9 +566,18 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event *even
error = -EEXIST; error = -EEXIST;
break; break;
case EPOLL_CTL_DEL: case EPOLL_CTL_DEL:
if (epi) if (epi) {
/*
* We need to protect the remove operation because another
* thread might be doing an epoll_wait() and using the
* target file.
*/
down_write(&ep->sem);
error = ep_remove(ep, epi); error = ep_remove(ep, epi);
else
up_write(&ep->sem);
} else
error = -ENOENT; error = -ENOENT;
break; break;
case EPOLL_CTL_MOD: case EPOLL_CTL_MOD:
...@@ -703,10 +710,6 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile) ...@@ -703,10 +710,6 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
file->f_vfsmnt = mntget(eventpoll_mnt); file->f_vfsmnt = mntget(eventpoll_mnt);
file->f_dentry = dget(dentry); file->f_dentry = dget(dentry);
/*
* Initialize the file as read/write because it could be used
* with write() to add/remove/change interest sets.
*/
file->f_pos = 0; file->f_pos = 0;
file->f_flags = O_RDONLY; file->f_flags = O_RDONLY;
file->f_op = &eventpoll_fops; file->f_op = &eventpoll_fops;
...@@ -815,6 +818,7 @@ static int ep_init(struct eventpoll *ep, unsigned int hashbits) ...@@ -815,6 +818,7 @@ static int ep_init(struct eventpoll *ep, unsigned int hashbits)
unsigned int i, hsize; unsigned int i, hsize;
rwlock_init(&ep->lock); rwlock_init(&ep->lock);
init_rwsem(&ep->sem);
init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait); init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist); INIT_LIST_HEAD(&ep->rdllist);
...@@ -841,11 +845,15 @@ static void ep_free(struct eventpoll *ep) ...@@ -841,11 +845,15 @@ static void ep_free(struct eventpoll *ep)
struct list_head *lsthead, *lnk; struct list_head *lsthead, *lnk;
struct epitem *epi; struct epitem *epi;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
ep_poll_safewake(&psw, &ep->poll_wait);
/* /*
* We need to lock this because we could be hit by * We need to lock this because we could be hit by
* eventpoll_release() while we're freeing the "struct eventpoll". * eventpoll_release() while we're freeing the "struct eventpoll".
*/ */
down_write(&epsem); down_write(&ep->sem);
/* /*
* Walks through the whole hash by unregistering poll callbacks. * Walks through the whole hash by unregistering poll callbacks.
...@@ -863,7 +871,7 @@ static void ep_free(struct eventpoll *ep) ...@@ -863,7 +871,7 @@ static void ep_free(struct eventpoll *ep)
/* /*
* Walks through the whole hash by freeing each "struct epitem". At this * Walks through the whole hash by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by * point we are sure no poll callbacks will be lingering around, and also by
* write-holding "epsem" we can be sure that no file cleanup code will hit * write-holding "sem" we can be sure that no file cleanup code will hit
* us during this operation. So we can avoid the lock on "ep->lock". * us during this operation. So we can avoid the lock on "ep->lock".
*/ */
for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) { for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
...@@ -876,7 +884,7 @@ static void ep_free(struct eventpoll *ep) ...@@ -876,7 +884,7 @@ static void ep_free(struct eventpoll *ep)
} }
} }
up_write(&epsem); up_write(&ep->sem);
/* Free hash pages */ /* Free hash pages */
ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits)); ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
...@@ -1337,20 +1345,6 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist ...@@ -1337,20 +1345,6 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist
/* If this file is already in the ready list we exit soon */ /* If this file is already in the ready list we exit soon */
if (!EP_IS_LINKED(&epi->txlink)) { if (!EP_IS_LINKED(&epi->txlink)) {
/*
* We need to increase the usage count of the "struct epitem" because
* another thread might call EPOLL_CTL_DEL on this target and make the
* object to vanish underneath our nose.
*/
ep_use_epitem(epi);
/*
* We need to increase the usage count of the "struct file" because
* another thread might call close() on this target and make the file
* to vanish before we will be able to call f_op->poll().
*/
get_file(epi->file);
/* /*
* This is initialized in this way so that the default * This is initialized in this way so that the default
* behaviour of the reinjecting code will be to push back * behaviour of the reinjecting code will be to push back
...@@ -1389,19 +1383,21 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, ...@@ -1389,19 +1383,21 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
struct epitem *epi; struct epitem *epi;
struct epoll_event event[EP_MAX_BUF_EVENTS]; struct epoll_event event[EP_MAX_BUF_EVENTS];
/*
* We can loop without lock because this is a task private list.
* The test done during the collection loop will guarantee us that
* another task will not try to collect this file. Also, items
* cannot vanish during the loop because we are holding "sem".
*/
list_for_each(lnk, txlist) { list_for_each(lnk, txlist) {
epi = list_entry(lnk, struct epitem, txlink); epi = list_entry(lnk, struct epitem, txlink);
/* Get the ready file event set */
revents = epi->file->f_op->poll(epi->file, NULL);
/* /*
* Release the file usage before checking the event mask. * Get the ready file event set. We can safely use the file
* In case this call will lead to the file removal, its * because we are holding the "sem" in read and this will
* ->event.events member has been already set to zero and * guarantee that both the file and the item will not vanish.
* this will make the event to be dropped.
*/ */
fput(epi->file); revents = epi->file->f_op->poll(epi->file, NULL);
/* /*
* Set the return event set for the current file descriptor. * Set the return event set for the current file descriptor.
...@@ -1416,17 +1412,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, ...@@ -1416,17 +1412,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
eventbuf++; eventbuf++;
if (eventbuf == EP_MAX_BUF_EVENTS) { if (eventbuf == EP_MAX_BUF_EVENTS) {
if (__copy_to_user(&events[eventcnt], event, if (__copy_to_user(&events[eventcnt], event,
eventbuf * sizeof(struct epoll_event))) { eventbuf * sizeof(struct epoll_event)))
/*
* We need to complete the loop to decrement the file
* usage before returning from this function.
*/
for (lnk = lnk->next; lnk != txlist; lnk = lnk->next) {
epi = list_entry(lnk, struct epitem, txlink);
fput(epi->file);
}
return -EFAULT; return -EFAULT;
}
eventcnt += eventbuf; eventcnt += eventbuf;
eventbuf = 0; eventbuf = 0;
} }
...@@ -1447,7 +1434,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, ...@@ -1447,7 +1434,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
/* /*
* Walk through the transfer list we collected with ep_collect_ready_items() * Walk through the transfer list we collected with ep_collect_ready_items()
* and, if 1) the item is still "alive" 2) its event set is not empty 3) it's * and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
* not already linked, links it to the ready list. * not already linked, links it to the ready list. Same as above, we are holding
* "sem" so items cannot vanish underneath our nose.
*/ */
static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
{ {
...@@ -1475,8 +1463,6 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) ...@@ -1475,8 +1463,6 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
ricnt++; ricnt++;
} }
ep_release_epitem(epi);
} }
if (ricnt) { if (ricnt) {
...@@ -1510,17 +1496,12 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, ...@@ -1510,17 +1496,12 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events,
/* /*
* We need to lock this because we could be hit by * We need to lock this because we could be hit by
* eventpoll_release() while we're transfering * eventpoll_release() and epoll_ctl(EPOLL_CTL_DEL).
* events to userspace. Read-holding "epsem" will lock
* out eventpoll_release() during the whole
* transfer loop and this will garantie us that the
* file will not vanish underneath our nose when
* we will call f_op->poll() from ep_send_events().
*/ */
down_read(&epsem); down_read(&ep->sem);
/* Collect/extract ready items */ /* Collect/extract ready items */
if (ep_collect_ready_items(ep, &txlist, maxevents)) { if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
/* Build result set in userspace */ /* Build result set in userspace */
eventcnt = ep_send_events(ep, &txlist, events); eventcnt = ep_send_events(ep, &txlist, events);
...@@ -1528,7 +1509,7 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, ...@@ -1528,7 +1509,7 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events,
ep_reinject_items(ep, &txlist); ep_reinject_items(ep, &txlist);
} }
up_read(&epsem); up_read(&ep->sem);
return eventcnt; return eventcnt;
} }
...@@ -1652,9 +1633,6 @@ static int __init eventpoll_init(void) ...@@ -1652,9 +1633,6 @@ static int __init eventpoll_init(void)
{ {
int error; int error;
/* Initialize the semaphore used to syncronize the file cleanup code */
init_rwsem(&epsem);
/* Initialize the structure used to perform safe poll wait head wake ups */ /* Initialize the structure used to perform safe poll wait head wake ups */
ep_poll_safewake_init(&psw); ep_poll_safewake_init(&psw);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment