Commit 8a81252b authored by Eric Dumazet's avatar Eric Dumazet Committed by Al Viro

fs/file.c: don't acquire files->file_lock in fd_install()

Mateusz Guzik reported:

 Currently obtaining a new file descriptor results in locking fdtable
 twice - once in order to reserve a slot and second time to fill it.

Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.

Mateusz provided an RFC patch and a micro benchmark:
  http://people.redhat.com/~mguzik/pipebench.c

A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.

We can use RCU instead of the spinlock.

__fd_install() must wait if a resize is in progress.

The resize must block new __fd_install() callers from starting,
and wait until ongoing installs are finished (synchronize_sched()).

A resize should be attempted by a single thread so as not to waste resources.

rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.

It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R)
CPU E5-2696 v2 @ 2.50GHz
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Mateusz Guzik <mguzik@redhat.com>
Acked-by: Mateusz Guzik <mguzik@redhat.com>
Tested-by: Mateusz Guzik <mguzik@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent 1af95de6
...@@ -500,3 +500,7 @@ in your dentry operations instead. ...@@ -500,3 +500,7 @@ in your dentry operations instead.
dentry, it does not get nameidata at all and it gets called only when cookie dentry, it does not get nameidata at all and it gets called only when cookie
is non-NULL. Note that link body isn't available anymore, so if you need it, is non-NULL. Note that link body isn't available anymore, so if you need it,
store it as cookie. store it as cookie.
--
[mandatory]
__fd_install() & fd_install() can now sleep. Callers should not
hold a spinlock or other resources that do not allow a schedule.
...@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) ...@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr); new_fdt = alloc_fdtable(nr);
/* make sure all __fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
synchronize_sched();
spin_lock(&files->file_lock); spin_lock(&files->file_lock);
if (!new_fdt) if (!new_fdt)
return -ENOMEM; return -ENOMEM;
...@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) ...@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
__free_fdtable(new_fdt); __free_fdtable(new_fdt);
return -EMFILE; return -EMFILE;
} }
/*
* Check again since another task may have expanded the fd table while
* we dropped the lock
*/
cur_fdt = files_fdtable(files); cur_fdt = files_fdtable(files);
if (nr >= cur_fdt->max_fds) { BUG_ON(nr < cur_fdt->max_fds);
/* Continue as planned */
copy_fdtable(new_fdt, cur_fdt); copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt); rcu_assign_pointer(files->fdt, new_fdt);
if (cur_fdt != &files->fdtab) if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu); call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
} else { /* coupled with smp_rmb() in __fd_install() */
/* Somebody else expanded, so undo our attempt */ smp_wmb();
__free_fdtable(new_fdt);
}
return 1; return 1;
} }
...@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) ...@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
* The files->file_lock should be held on entry, and will be held on exit. * The files->file_lock should be held on entry, and will be held on exit.
*/ */
static int expand_files(struct files_struct *files, int nr) static int expand_files(struct files_struct *files, int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{ {
struct fdtable *fdt; struct fdtable *fdt;
int expanded = 0;
repeat:
fdt = files_fdtable(files); fdt = files_fdtable(files);
/* Do we need to expand? */ /* Do we need to expand? */
if (nr < fdt->max_fds) if (nr < fdt->max_fds)
return 0; return expanded;
/* Can we expand? */ /* Can we expand? */
if (nr >= sysctl_nr_open) if (nr >= sysctl_nr_open)
return -EMFILE; return -EMFILE;
if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock);
goto repeat;
}
/* All good, so we try */ /* All good, so we try */
return expand_fdtable(files, nr); files->resize_in_progress = true;
expanded = expand_fdtable(files, nr);
files->resize_in_progress = false;
wake_up_all(&files->resize_wait);
return expanded;
} }
static inline void __set_close_on_exec(int fd, struct fdtable *fdt) static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
...@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) ...@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
atomic_set(&newf->count, 1); atomic_set(&newf->count, 1);
spin_lock_init(&newf->file_lock); spin_lock_init(&newf->file_lock);
newf->resize_in_progress = false;
init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0; newf->next_fd = 0;
new_fdt = &newf->fdtab; new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->max_fds = NR_OPEN_DEFAULT;
...@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, ...@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file) struct file *file)
{ {
struct fdtable *fdt; struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files); might_sleep();
rcu_read_lock_sched();
while (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
wait_event(files->resize_wait, !files->resize_in_progress);
rcu_read_lock_sched();
}
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL); BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file); rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock); rcu_read_unlock_sched();
} }
void fd_install(unsigned int fd, struct file *file) void fd_install(unsigned int fd, struct file *file)
......
...@@ -47,6 +47,9 @@ struct files_struct { ...@@ -47,6 +47,9 @@ struct files_struct {
* read mostly part * read mostly part
*/ */
atomic_t count; atomic_t count;
bool resize_in_progress;
wait_queue_head_t resize_wait;
struct fdtable __rcu *fdt; struct fdtable __rcu *fdt;
struct fdtable fdtab; struct fdtable fdtab;
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment