[PATCH] epoll race fix

The was a race triggered by heavy MT usage that could have caused processes in D state. Bad Davide, bad ... Also, the semaphore is now per-epoll-fd and not global. Plus some comment adjustment.

[PATCH] epoll race fix
The was a race triggered by heavy MT usage that could have caused processes in D state. Bad Davide, bad ... Also, the semaphore is now per-epoll-fd and not global. Plus some comment adjustment.
58552705 · Davide Libenzi · Ben Collins · 4a07c099 · 58552705
Commit 58552705 authored Jun 01, 2003 by Davide Libenzi Committed by Ben Collins Jun 01, 2003
Hide whitespace changes
Inline Side-by-side

Showing with 55 additions and 77 deletions

fs/eventpoll.c fs/eventpoll.c +55 -77

No files found.
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -150,6 +150,14 @@ struct eventpoll {
 	/* Protect the this structure access */
 	rwlock_t lock;

+	/*
+	 * This semaphore is used to ensure that files are not removed
+	 * while epoll is using them. This is read-held during the event
+	 * collection loop and it is write-held during the file cleanup
+	 * path, the epoll file exit code and the ctl operations.
+	 */
+	struct rw_semaphore sem;
+
 	/* Wait queue used by sys_epoll_wait() */
 	wait_queue_head_t wq;

@@ -279,16 +287,6 @@ static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
 /* Safe wake up implementation */
 static struct poll_safewake psw;

-/*
- * This semaphore is used to ensure that files are not removed
- * while epoll is using them. Namely the f_op->poll(), since
- * it has to be called from outside the lock, must be protected.
- * This is read-held during the event transfer loop to userspace
- * and it is write-held during the file cleanup path and the epoll
- * file exit code.
- */
-static struct rw_semaphore epsem;
-
 /* Slab cache used to allocate "struct epitem" */
 static kmem_cache_t *epi_cache;

@@ -357,15 +355,14 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 	list_for_each(lnk, lsthead) {
 		tncur = list_entry(lnk, struct wake_task_node, llink);

-		if (tncur->task == this_task) {
-			if (tncur->wq == wq || ++wake_nests > EP_MAX_POLLWAKE_NESTS) {
-				/*
-				 * Ops ... loop detected or maximum nest level reached.
-				 * We abort this wake by breaking the cycle itself.
-				 */
-				spin_unlock_irqrestore(&psw->lock, flags);
-				return;
-			}
+		if (tncur->wq == wq ||
+		    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
+			/*
+			 * Ops ... loop detected or maximum nest level reached.
+			 * We abort this wake by breaking the cycle itself.
+			 */
+			spin_unlock_irqrestore(&psw->lock, flags);
+			return;
 		}
 	}

@@ -437,14 +434,15 @@ void eventpoll_release(struct file *file)
 	 * The only hit might come from ep_free() but by holding the semaphore
 	 * will correctly serialize the operation.
 	 */
-	down_write(&epsem);
+
 	while (!list_empty(lsthead)) {
 		epi = list_entry(lsthead->next, struct epitem, fllink);

 		EP_LIST_DEL(&epi->fllink);
+		down_write(&epi->ep->sem);
 		ep_remove(epi->ep, epi);
+		up_write(&epi->ep->sem);
 	}
-	up_write(&epsem);
 }


@@ -568,9 +566,18 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event *even
 			error = -EEXIST;
 		break;
 	case EPOLL_CTL_DEL:
-		if (epi)
+		if (epi) {
+			/*
+			 * We need to protect the remove operation because another
+			 * thread might be doing an epoll_wait() and using the
+			 * target file.
+			 */
+			down_write(&ep->sem);
+
 			error = ep_remove(ep, epi);
-		else
+
+			up_write(&ep->sem);
+		} else
 			error = -ENOENT;
 		break;
 	case EPOLL_CTL_MOD:
@@ -703,10 +710,6 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 	file->f_vfsmnt = mntget(eventpoll_mnt);
 	file->f_dentry = dget(dentry);

-	/*
-	 * Initialize the file as read/write because it could be used
-	 * with write() to add/remove/change interest sets.
-	 */
 	file->f_pos = 0;
 	file->f_flags = O_RDONLY;
 	file->f_op = &eventpoll_fops;
@@ -815,6 +818,7 @@ static int ep_init(struct eventpoll *ep, unsigned int hashbits)
 	unsigned int i, hsize;

 	rwlock_init(&ep->lock);
+	init_rwsem(&ep->sem);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
@@ -841,11 +845,15 @@ static void ep_free(struct eventpoll *ep)
 	struct list_head *lsthead, *lnk;
 	struct epitem *epi;

+	/* We need to release all tasks waiting for these file */
+	if (waitqueue_active(&ep->poll_wait))
+		ep_poll_safewake(&psw, &ep->poll_wait);
+
 	/*
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release() while we're freeing the "struct eventpoll".
 	 */
-	down_write(&epsem);
+	down_write(&ep->sem);

 	/*
 	 * Walks through the whole hash by unregistering poll callbacks.
@@ -863,7 +871,7 @@ static void ep_free(struct eventpoll *ep)
 	/*
 	 * Walks through the whole hash by freeing each "struct epitem". At this
 	 * point we are sure no poll callbacks will be lingering around, and also by
-	 * write-holding "epsem" we can be sure that no file cleanup code will hit
+	 * write-holding "sem" we can be sure that no file cleanup code will hit
 	 * us during this operation. So we can avoid the lock on "ep->lock".
 	 */
 	for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
@@ -876,7 +884,7 @@ static void ep_free(struct eventpoll *ep)
 		}
 	}

-	up_write(&epsem);
+	up_write(&ep->sem);

 	/* Free hash pages */
 	ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
@@ -1337,20 +1345,6 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist

 		/* If this file is already in the ready list we exit soon */
 		if (!EP_IS_LINKED(&epi->txlink)) {
-			/*
-			 * We need to increase the usage count of the "struct epitem" because
-			 * another thread might call EPOLL_CTL_DEL on this target and make the
-			 * object to vanish underneath our nose.
-			 */
-			ep_use_epitem(epi);
-
-			/*
-			 * We need to increase the usage count of the "struct file" because
-			 * another thread might call close() on this target and make the file
-			 * to vanish before we will be able to call f_op->poll().
-			 */
-			get_file(epi->file);
-
 			/*
 			 * This is initialized in this way so that the default
 			 * behaviour of the reinjecting code will be to push back
@@ -1389,19 +1383,21 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 	struct epitem *epi;
 	struct epoll_event event[EP_MAX_BUF_EVENTS];

+	/*
+	 * We can loop without lock because this is a task private list.
+	 * The test done during the collection loop will guarantee us that
+	 * another task will not try to collect this file. Also, items
+	 * cannot vanish during the loop because we are holding "sem".
+	 */
 	list_for_each(lnk, txlist) {
 		epi = list_entry(lnk, struct epitem, txlink);

-		/* Get the ready file event set */
-		revents = epi->file->f_op->poll(epi->file, NULL);
-
 		/*
-		 * Release the file usage before checking the event mask.
-		 * In case this call will lead to the file removal, its
-		 * ->event.events member has been already set to zero and
-		 * this will make the event to be dropped.
+		 * Get the ready file event set. We can safely use the file
+		 * because we are holding the "sem" in read and this will
+		 * guarantee that both the file and the item will not vanish.
 		 */
-		fput(epi->file);
+		revents = epi->file->f_op->poll(epi->file, NULL);

 		/*
 		 * Set the return event set for the current file descriptor.
@@ -1416,17 +1412,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 			eventbuf++;
 			if (eventbuf == EP_MAX_BUF_EVENTS) {
 				if (__copy_to_user(&events[eventcnt], event,
-						   eventbuf * sizeof(struct epoll_event))) {
-					/*
-					 * We need to complete the loop to decrement the file
-					 * usage before returning from this function.
-					 */
-					for (lnk = lnk->next; lnk != txlist; lnk = lnk->next) {
-						epi = list_entry(lnk, struct epitem, txlink);
-						fput(epi->file);
-					}
+						   eventbuf * sizeof(struct epoll_event)))
 					return -EFAULT;
-				}
 				eventcnt += eventbuf;
 				eventbuf = 0;
 			}
@@ -1447,7 +1434,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 /*
 * Walk through the transfer list we collected with ep_collect_ready_items()
 * and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
- * not already linked, links it to the ready list.
+ * not already linked, links it to the ready list. Same as above, we are holding
+ * "sem" so items cannot vanish underneath our nose.
 */
 static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 {
@@ -1475,8 +1463,6 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ricnt++;
 		}
-
-		ep_release_epitem(epi);
 	}

 	if (ricnt) {
@@ -1510,17 +1496,12 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events,

 	/*
 	 * We need to lock this because we could be hit by
-	 * eventpoll_release() while we're transfering
-	 * events to userspace. Read-holding "epsem" will lock
-	 * out eventpoll_release() during the whole
-	 * transfer loop and this will garantie us that the
-	 * file will not vanish underneath our nose when
-	 * we will call f_op->poll() from ep_send_events().
+	 * eventpoll_release() and epoll_ctl(EPOLL_CTL_DEL).
 	 */
-	down_read(&epsem);
+	down_read(&ep->sem);

 	/* Collect/extract ready items */
-	if (ep_collect_ready_items(ep, &txlist, maxevents)) {
+	if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
 		/* Build result set in userspace */
 		eventcnt = ep_send_events(ep, &txlist, events);

@@ -1528,7 +1509,7 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events,
 		ep_reinject_items(ep, &txlist);
 	}

-	up_read(&epsem);
+	up_read(&ep->sem);

 	return eventcnt;
 }
@@ -1652,9 +1633,6 @@ static int __init eventpoll_init(void)
 {
 	int error;

-	/* Initialize the semaphore used to syncronize the file cleanup code */
-	init_rwsem(&epsem);
-
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_poll_safewake_init(&psw);