[PATCH] epoll: replace the file lookup hash with rbtrees

The epoll allocation for the fd lookup hash used to allocate up to 1MB (depending on the "hint" size passed to epoll_create()) with __get_free_pages(0), and this might lead to a "malicious" user to do something like: for (i = 0; i < 1024; i++) epoll_create(BIG-NUM); You can replace "malicious user" with IBM-ltp test suite, and the meaning does not change. The above code might exhaust memory badly, even before the file creation limit is topped. Also, the allocation was independent from the number of fds pushed into the epoll fd hash. Using an rb-tree ther will be not pre-allocation of the hash, and the size of the memory used will be proportional to the number of fds pushed into the epoll fd. The patch also removes 100 lines of code, that is never a bad thing ;) Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>

[PATCH] epoll: replace the file lookup hash with rbtrees
The epoll allocation for the fd lookup hash used to allocate up to 1MB (depending on the "hint" size passed to epoll_create()) with __get_free_pages(0), and this might lead to a "malicious" user to do something like: for (i = 0; i < 1024; i++) epoll_create(BIG-NUM); You can replace "malicious user" with IBM-ltp test suite, and the meaning does not change. The above code might exhaust memory badly, even before the file creation limit is topped. Also, the allocation was independent from the number of fds pushed into the epoll fd hash. Using an rb-tree ther will be not pre-allocation of the hash, and the size of the memory used will be proportional to the number of fds pushed into the epoll fd. The patch also removes 100 lines of code, that is never a bad thing ;) Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
3ca54916 · Davide Libenzi · Linus Torvalds · b2dbffab · 3ca54916
Commit 3ca54916 authored Jun 17, 2004 by Davide Libenzi Committed by Linus Torvalds Jun 17, 2004
Show whitespace changes
Inline Side-by-side

Showing with 93 additions and 187 deletions

fs/eventpoll.c fs/eventpoll.c +93 -187

No files found.
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -29,6 +29,7 @@
 #include <linux/spinlock.h>
 #include <linux/syscalls.h>
 #include <linux/rwsem.h>
+#include <linux/rbtree.h>
 #include <linux/wait.h>
 #include <linux/eventpoll.h>
 #include <linux/mount.h>
@@ -100,22 +101,6 @@
 /* Maximum number of poll wake up nests we are allowing */
 #define EP_MAX_POLLWAKE_NESTS 4
-/* Maximum size of the hash in bits ( 2^N ) */
-#define EP_MAX_HASH_BITS 17
-/* Minimum size of the hash in bits ( 2^N ) */
-#define EP_MIN_HASH_BITS 9
-/* Number of hash entries ( "struct list_head" ) inside a page */
-#define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head))
-/* Maximum size of the hash in pages */
-#define EP_MAX_HPAGES ((1 << EP_MAX_HASH_BITS) / EP_HENTRY_X_PAGE + 1)
-/* Number of pages allocated for an "hbits" sized hash table */
-#define EP_HASH_PAGES(hbits) ((int) ((1 << (hbits)) / EP_HENTRY_X_PAGE + \
-				     ((1 << (hbits)) % EP_HENTRY_X_PAGE ? 1: 0)))
 /* Macro to allocate a "struct epitem" from the slab cache */
 #define EPI_MEM_ALLOC()	(struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL)
@@ -131,6 +116,22 @@
 /* Fast test to see if the file is an evenpoll file */
 #define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
+/* Setup the structure that is used as key for the rb-tree */
+#define EP_SET_FFD(p, f, d) do { (p)->file = (f); (p)->fd = (d); } while (0)
+/* Compare rb-tree keys */
+#define EP_CMP_FFD(p1, p2) ((p1)->file > (p2)->file ? +1: \
+			    ((p1)->file < (p2)->file ? -1: (p1)->fd - (p2)->fd))
+/* Special initialization for the rb-tree node to detect linkage */
+#define EP_RB_INITNODE(n) (n)->rb_parent = (n)
+/* Removes a node from the rb-tree and marks it for a fast is-linked check */
+#define EP_RB_ERASE(n, r) do { rb_erase(n, r); (n)->rb_parent = (n); } while (0)
+/* Fast check to verify that the item is linked to the main rb-tree */
+#define EP_RB_LINKED(n) ((n)->rb_parent != (n))
 /*
 * Remove the item from the list and perform its initialization.
 * This is useful for us because we can test if the item is linked
@@ -147,6 +148,12 @@
 /* Get the "struct epitem" from an epoll queue wrapper */
 #define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi)
+struct epoll_filefd {
+	struct file *file;
+	int fd;
+};
 /*
 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
 * It is used to keep track on all tasks that are currently inside the wake_up() code
@@ -195,11 +202,8 @@ struct eventpoll {
 	/* List of ready file descriptors */
 	struct list_head rdllist;
-	/* Size of the hash */
+	/* RB-Tree root used to store monitored fd structs */
-	unsigned int hashbits;
+	struct rb_root rbr;
-	/* Pages for the "struct epitem" hash */
-	char *hpages[EP_MAX_HPAGES];
 };
 /* Wait structure used by the poll hooks */
@@ -225,14 +229,14 @@ struct eppoll_entry {
 * have an entry of this type linked to the hash.
 */
 struct epitem {
-	/* List header used to link this structure to the eventpoll hash */
+	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
-	struct list_head llink;
+	struct rb_node rbn;
 	/* List header used to link this structure to the eventpoll ready list */
 	struct list_head rdllink;
-	/* The file descriptor this item refers to */
+	/* The file descriptor information this item refers to */
-	int fd;
+	struct epoll_filefd ffd;
 	/* Number of active wait queue attached to poll operations */
 	int nwait;
@@ -243,9 +247,6 @@ struct epitem {
 	/* The "container" of this item */
 	struct eventpoll *ep;
-	/* The file this item refers to */
-	struct file *file;
 	/* The structure that describe the interested events and the source fd */
 	struct epoll_event event;
@@ -278,22 +279,15 @@ struct ep_pqueue {
 static void ep_poll_safewake_init(struct poll_safewake *psw);
 static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static unsigned int ep_get_hash_bits(unsigned int hintsize);
 static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
-static int ep_alloc_pages(char **pages, int numpages);
+static int ep_file_init(struct file *file);
-static int ep_free_pages(char **pages, int numpages);
-static int ep_file_init(struct file *file, unsigned int hashbits);
-static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file,
-				  int fd);
-static struct list_head *ep_hash_entry(struct eventpoll *ep,
-				       unsigned int index);
-static int ep_init(struct eventpoll *ep, unsigned int hashbits);
 static void ep_free(struct eventpoll *ep);
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
 static void ep_use_epitem(struct epitem *epi);
 static void ep_release_epitem(struct epitem *epi);
 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 				 poll_table *pt);
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd);
 static int ep_modify(struct eventpoll *ep, struct epitem *epi,
@@ -424,19 +418,6 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 }
-/*
- * Calculate the size of the hash in bits. The returned size will be
- * bounded between EP_MIN_HASH_BITS and EP_MAX_HASH_BITS.
- */
-static unsigned int ep_get_hash_bits(unsigned int hintsize)
-{
-	unsigned int i, val;
-	for (i = 0, val = 1; val < hintsize && i < EP_MAX_HASH_BITS; i++, val <<= 1);
-	return i <  EP_MIN_HASH_BITS ?	EP_MIN_HASH_BITS: i;
-}
 /* Used to initialize the epoll bits inside the "struct file" */
 void eventpoll_init_file(struct file *file)
 {
@@ -492,7 +473,6 @@ void eventpoll_release_file(struct file *file)
 asmlinkage long sys_epoll_create(int size)
 {
 	int error, fd;
-	unsigned int hashbits;
 	struct inode *inode;
 	struct file *file;
@@ -504,9 +484,6 @@ asmlinkage long sys_epoll_create(int size)
 	if (size <= 0)
 		goto eexit_1;
-	/* Correctly size the hash */
-	hashbits = ep_get_hash_bits((unsigned int) size);
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure, and inode and a free file descriptor.
@@ -516,7 +493,7 @@ asmlinkage long sys_epoll_create(int size)
 		goto eexit_1;
 	/* Setup the file internal data structure ( "struct eventpoll" ) */
-	error = ep_file_init(file, hashbits);
+	error = ep_file_init(file);
 	if (error)
 		goto eexit_2;
@@ -768,114 +745,32 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 }
-static int ep_alloc_pages(char **pages, int numpages)
+static int ep_file_init(struct file *file)
 {
-	int i;
-	for (i = 0; i < numpages; i++) {
-		pages[i] = (char *) __get_free_pages(GFP_KERNEL, 0);
-		if (!pages[i]) {
-			for (--i; i >= 0; i--) {
-				ClearPageReserved(virt_to_page(pages[i]));
-				free_pages((unsigned long) pages[i], 0);
-			}
-			return -ENOMEM;
-		}
-		SetPageReserved(virt_to_page(pages[i]));
-	}
-	return 0;
-}
-static int ep_free_pages(char **pages, int numpages)
-{
-	int i;
-	for (i = 0; i < numpages; i++) {
-		ClearPageReserved(virt_to_page(pages[i]));
-		free_pages((unsigned long) pages[i], 0);
-	}
-	return 0;
-}
-static int ep_file_init(struct file *file, unsigned int hashbits)
-{
-	int error;
 	struct eventpoll *ep;
 	if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
 		return -ENOMEM;
 	memset(ep, 0, sizeof(*ep));
-	error = ep_init(ep, hashbits);
-	if (error) {
-		kfree(ep);
-		return error;
-	}
-	file->private_data = ep;
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
-		     current, ep));
-	return 0;
-}
-/*
- * Calculate the index of the hash relative to "file".
- */
-static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file, int fd)
-{
-	unsigned long ptr = (unsigned long) file ^ (fd << ep->hashbits);
-	return (unsigned int) hash_ptr((void *) ptr, ep->hashbits);
-}
-/*
- * Returns the hash entry ( struct list_head * ) of the passed index.
- */
-static struct list_head *ep_hash_entry(struct eventpoll *ep, unsigned int index)
-{
-	return (struct list_head *) (ep->hpages[index / EP_HENTRY_X_PAGE] +
-				     (index % EP_HENTRY_X_PAGE) * sizeof(struct list_head));
-}
-static int ep_init(struct eventpoll *ep, unsigned int hashbits)
-{
-	int error;
-	unsigned int i, hsize;
 	rwlock_init(&ep->lock);
 	init_rwsem(&ep->sem);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
+	ep->rbr = RB_ROOT;
-	/* Hash allocation and setup */
+	file->private_data = ep;
-	ep->hashbits = hashbits;
-	error = ep_alloc_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
-	if (error)
-		goto eexit_1;
-	/* Initialize hash buckets */
-	for (i = 0, hsize = 1 << hashbits; i < hsize; i++)
-		INIT_LIST_HEAD(ep_hash_entry(ep, i));
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
+		     current, ep));
 	return 0;
-eexit_1:
-	return error;
 }
 static void ep_free(struct eventpoll *ep)
 {
-	unsigned int i, hsize;
+	struct rb_node *rbp;
-	struct list_head *lsthead, *lnk;
 	struct epitem *epi;
 	/* We need to release all tasks waiting for these file */
@@ -893,17 +788,13 @@ static void ep_free(struct eventpoll *ep)
 	down(&epsem);
 	/*
-	 * Walks through the whole hash by unregistering poll callbacks.
+	 * Walks through the whole tree by unregistering poll callbacks.
 	 */
-	for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
-		lsthead = ep_hash_entry(ep, i);
+		epi = rb_entry(rbp, struct epitem, rbn);
-		list_for_each(lnk, lsthead) {
-			epi = list_entry(lnk, struct epitem, llink);
 		ep_unregister_pollwait(ep, epi);
 	}
-	}
 	/*
 	 * Walks through the whole hash by freeing each "struct epitem". At this
@@ -911,20 +802,12 @@ static void ep_free(struct eventpoll *ep)
 	 * write-holding "sem" we can be sure that no file cleanup code will hit
 	 * us during this operation. So we can avoid the lock on "ep->lock".
 	 */
-	for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
+	while ((rbp = rb_first(&ep->rbr))) {
-		lsthead = ep_hash_entry(ep, i);
+		epi = rb_entry(rbp, struct epitem, rbn);
-		while (!list_empty(lsthead)) {
-			epi = list_entry(lsthead->next, struct epitem, llink);
 		ep_remove(ep, epi);
 	}
-	}
 	up(&epsem);
-	/* Free hash pages */
-	ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
 }
@@ -935,29 +818,33 @@ static void ep_free(struct eventpoll *ep)
 */
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 {
+	int kcmp;
 	unsigned long flags;
-	struct list_head *lsthead, *lnk;
+	struct rb_node *rbp;
-	struct epitem *epi = NULL;
+	struct epitem *epi, *epir = NULL;
+	struct epoll_filefd ffd;
+	EP_SET_FFD(&ffd, file, fd);
 	read_lock_irqsave(&ep->lock, flags);
+	for (rbp = ep->rbr.rb_node; rbp; ) {
-	lsthead = ep_hash_entry(ep, ep_hash_index(ep, file, fd));
+		epi = rb_entry(rbp, struct epitem, rbn);
-	list_for_each(lnk, lsthead) {
+		kcmp = EP_CMP_FFD(&ffd, &epi->ffd);
-		epi = list_entry(lnk, struct epitem, llink);
+		if (kcmp > 0)
+			rbp = rbp->rb_right;
-		if (epi->file == file && epi->fd == fd) {
+		else if (kcmp < 0)
+			rbp = rbp->rb_left;
+		else {
 			ep_use_epitem(epi);
+			epir = epi;
 			break;
 		}
-		epi = NULL;
 	}
 	read_unlock_irqrestore(&ep->lock, flags);
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
-		     current, file, epi));
+		     current, file, epir));
-	return epi;
+	return epir;
 }
@@ -1009,6 +896,26 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 }
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
+{
+	int kcmp;
+	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
+	struct epitem *epic;
+	while (*p) {
+		parent = *p;
+		epic = rb_entry(parent, struct epitem, rbn);
+		kcmp = EP_CMP_FFD(&epi->ffd, &epic->ffd);
+		if (kcmp > 0)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&epi->rbn, parent, p);
+	rb_insert_color(&epi->rbn, &ep->rbr);
+}
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd)
 {
@@ -1022,14 +929,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		goto eexit_1;
 	/* Item initialization follow here ... */
-	INIT_LIST_HEAD(&epi->llink);
+	EP_RB_INITNODE(&epi->rbn);
 	INIT_LIST_HEAD(&epi->rdllink);
 	INIT_LIST_HEAD(&epi->fllink);
 	INIT_LIST_HEAD(&epi->txlink);
 	INIT_LIST_HEAD(&epi->pwqlist);
 	epi->ep = ep;
-	epi->file = tfile;
+	EP_SET_FFD(&epi->ffd, tfile, fd);
-	epi->fd = fd;
 	epi->event = *event;
 	atomic_set(&epi->usecnt, 1);
 	epi->nwait = 0;
@@ -1061,8 +967,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* We have to drop the new item inside our item list to keep track of it */
 	write_lock_irqsave(&ep->lock, flags);
-	/* Add the current item to the hash table */
+	/* Add the current item to the rb-tree */
-	list_add(&epi->llink, ep_hash_entry(ep, ep_hash_index(ep, tfile, fd)));
+	ep_rbtree_insert(ep, epi);
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
@@ -1126,7 +1032,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = epi->file->f_op->poll(epi->file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 	write_lock_irqsave(&ep->lock, flags);
@@ -1137,7 +1043,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * If the item is not linked to the hash it means that it's on its
 	 * way toward the removal. Do nothing in this case.
 	 */
-	if (EP_IS_LINKED(&epi->llink)) {
+	if (EP_RB_LINKED(&epi->rbn)) {
 		/*
 		 * If the item is "hot" and it is not registered inside the ready
 		 * list, push it inside. If the item is not "hot" and it is currently
@@ -1205,7 +1111,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 	 * The check protect us from doing a double unlink ( crash ).
 	 */
 	error = -ENOENT;
-	if (!EP_IS_LINKED(&epi->llink))
+	if (!EP_RB_LINKED(&epi->rbn))
 		goto eexit_1;
 	/*
@@ -1216,11 +1122,11 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 	epi->event.events = 0;
 	/*
-	 * At this point is safe to do the job, unlink the item from our list.
+	 * At this point is safe to do the job, unlink the item from our rb-tree.
 	 * This operation togheter with the above check closes the door to
 	 * double unlinks.
 	 */
-	EP_LIST_DEL(&epi->llink);
+	EP_RB_ERASE(&epi->rbn, &ep->rbr);
 	/*
 	 * If the item we are going to remove is inside the ready file descriptors
@@ -1247,7 +1153,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
 	int error;
 	unsigned long flags;
-	struct file *file = epi->file;
+	struct file *file = epi->ffd.file;
 	/*
 	 * Removes poll wait queue hooks. We _have_ to do this without holding
@@ -1446,7 +1352,7 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 		 * because we are holding the "sem" in read and this will
 		 * guarantee that both the file and the item will not vanish.
 		 */
-		revents = epi->file->f_op->poll(epi->file, NULL);
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 		/*
 		 * Set the return event set for the current file descriptor.
@@ -1497,7 +1403,7 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 		 * item is set to have an Edge Triggered behaviour, we don't have
 		 * to push it back either.
 		 */
-		if (EP_IS_LINKED(&epi->llink) && !(epi->event.events & EPOLLET) &&
+		if (EP_RB_LINKED(&epi->rbn) && !(epi->event.events & EPOLLET) &&
 		    (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ricnt++;