Commit f3f86e33 authored by Linus Torvalds's avatar Linus Torvalds

vfs: Fix pathological performance case for __alloc_fd()

Al Viro points out that:
> >     * [Linux-specific aside] our __alloc_fd() can degrade quite badly
> > with some use patterns.  The cacheline pingpong in the bitmap is probably
> > inevitable, unless we accept considerably heavier memory footprint,
> > but we also have a case when alloc_fd() takes O(n) and it's _not_ hard
> > to trigger - close(3);open(...); will have the next open() after that
> > scanning the entire in-use bitmap.

And Eric Dumazet has a somewhat realistic multithreaded microbenchmark
that opens and closes a lot of sockets with minimal work per socket.

This patch largely fixes it.  We keep a 2nd-level bitmap of the open
file bitmaps, showing which words are already full.  So then we can
traverse that second-level bitmap to efficiently skip already allocated
file descriptors.

On his benchmark, this improves performance by up to an order of
magnitude, by avoiding the excessive open file bitmap scanning.
Tested-and-acked-by: default avatarEric Dumazet <edumazet@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 8a28d674
...@@ -56,6 +56,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu) ...@@ -56,6 +56,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
__free_fdtable(container_of(rcu, struct fdtable, rcu)); __free_fdtable(container_of(rcu, struct fdtable, rcu));
} }
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
/* /*
* Expand the fdset in the files_struct. Called with the files spinlock * Expand the fdset in the files_struct. Called with the files spinlock
* held for write. * held for write.
...@@ -77,6 +80,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) ...@@ -77,6 +80,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
memset((char *)(nfdt->open_fds) + cpy, 0, set); memset((char *)(nfdt->open_fds) + cpy, 0, set);
memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
memset((char *)(nfdt->close_on_exec) + cpy, 0, set); memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
cpy = BITBIT_SIZE(ofdt->max_fds);
set = BITBIT_SIZE(nfdt->max_fds) - cpy;
memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
memset(cpy+(char *)nfdt->full_fds_bits, 0, set);
} }
static struct fdtable * alloc_fdtable(unsigned int nr) static struct fdtable * alloc_fdtable(unsigned int nr)
...@@ -115,12 +123,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) ...@@ -115,12 +123,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data; fdt->fd = data;
data = alloc_fdmem(max_t(size_t, data = alloc_fdmem(max_t(size_t,
2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
if (!data) if (!data)
goto out_arr; goto out_arr;
fdt->open_fds = data; fdt->open_fds = data;
data += nr / BITS_PER_BYTE; data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data; fdt->close_on_exec = data;
data += nr / BITS_PER_BYTE;
fdt->full_fds_bits = data;
return fdt; return fdt;
...@@ -229,14 +239,18 @@ static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) ...@@ -229,14 +239,18 @@ static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
__clear_bit(fd, fdt->close_on_exec); __clear_bit(fd, fdt->close_on_exec);
} }
static inline void __set_open_fd(int fd, struct fdtable *fdt) static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{ {
__set_bit(fd, fdt->open_fds); __set_bit(fd, fdt->open_fds);
fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
} }
static inline void __clear_open_fd(int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{ {
__clear_bit(fd, fdt->open_fds); __clear_bit(fd, fdt->open_fds);
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
} }
static int count_open_files(struct fdtable *fdt) static int count_open_files(struct fdtable *fdt)
...@@ -280,6 +294,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) ...@@ -280,6 +294,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init; new_fdt->open_fds = newf->open_fds_init;
new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0]; new_fdt->fd = &newf->fd_array[0];
spin_lock(&oldf->file_lock); spin_lock(&oldf->file_lock);
...@@ -323,6 +338,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) ...@@ -323,6 +338,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
memcpy(new_fdt->full_fds_bits, old_fdt->full_fds_bits, BITBIT_SIZE(open_files));
for (i = open_files; i != 0; i--) { for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++; struct file *f = *old_fds++;
...@@ -454,10 +470,25 @@ struct files_struct init_files = { ...@@ -454,10 +470,25 @@ struct files_struct init_files = {
.fd = &init_files.fd_array[0], .fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init, .close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init, .open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
}, },
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
}; };
static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
{
unsigned long maxfd = fdt->max_fds;
unsigned long maxbit = maxfd / BITS_PER_LONG;
unsigned long bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit > maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
return find_next_zero_bit(fdt->open_fds, maxfd, start);
}
/* /*
* allocate a file descriptor, mark it busy. * allocate a file descriptor, mark it busy.
*/ */
...@@ -476,7 +507,7 @@ int __alloc_fd(struct files_struct *files, ...@@ -476,7 +507,7 @@ int __alloc_fd(struct files_struct *files,
fd = files->next_fd; fd = files->next_fd;
if (fd < fdt->max_fds) if (fd < fdt->max_fds)
fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); fd = find_next_fd(fdt, fd);
/* /*
* N.B. For clone tasks sharing a files structure, this test * N.B. For clone tasks sharing a files structure, this test
......
...@@ -26,6 +26,7 @@ struct fdtable { ...@@ -26,6 +26,7 @@ struct fdtable {
struct file __rcu **fd; /* current fd array */ struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec; unsigned long *close_on_exec;
unsigned long *open_fds; unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu; struct rcu_head rcu;
}; };
...@@ -59,6 +60,7 @@ struct files_struct { ...@@ -59,6 +60,7 @@ struct files_struct {
int next_fd; int next_fd;
unsigned long close_on_exec_init[1]; unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1]; unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT]; struct file __rcu * fd_array[NR_OPEN_DEFAULT];
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment