Commit 7bc3e6e5 authored by Eric W. Biederman

proc: Use a list of inodes to flush from proc

Rework the flushing of proc to use a list of directory inodes that
need to be flushed.

The list is kept on struct pid not on struct task_struct, as there is
a fixed connection between proc inodes and pids but at least for the
case of de_thread the pid of a task_struct changes.

This removes the dependency on proc_mnt, which allows for different
mounts of proc having different mount options even in the same pid
namespace, and this allows for the removal of proc_mnt, which will
trivially allow the first mount of proc to honor its mount options.

This flushing remains an optimization.  The functions
pid_delete_dentry and pid_revalidate ensure that ordinary dcache
management will not attempt to use dentries past the point their
respective task has died.  When unused the shrinker will
eventually be able to remove these dentries.

There is a case in de_thread where proc_flush_pid can be
called early for a given pid, which winds up being
safe (if suboptimal) as this is just an optimization.

Only pid directories are put on the list as the other
per pid files are children of those directories and
d_invalidate on the directory will get them as well.

So that the pid can be used during flushing its reference count is
taken in release_task and dropped in proc_flush_pid.  Further the call
of proc_flush_pid is moved after the tasklist_lock is released in
release_task so that it is certain that the pid has already been
unhashed when flushing is taking place.  This removes a small race
where a dentry could be recreated.

As struct pid is supposed to be small and I need a per pid lock
I reuse the only lock that currently exists in struct pid, the
wait_pidfd.lock.

The net result is that this adds all of this functionality
with just a little extra list management overhead and
a single extra pointer in struct pid.

v2: Initialize pid->inodes.  I somehow failed to get that
    initialization into the initial version of the patch.  A boot
    failure was reported by "kernel test robot <lkp@intel.com>", and
    failure to initialize that pid->inodes matches all of the reported
    symptoms.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
parent 71448011
...@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode, ...@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
*rgid = gid; *rgid = gid;
} }
/*
 * proc_pid_evict_inode - stop tracking the pid attached to a proc inode.
 * @ei: proc inode being evicted; ei->pid is dereferenced unconditionally,
 *      and the caller (proc_evict_inode) only calls this when it is
 *      non-NULL.
 *
 * Undoes the per-pid bookkeeping done by proc_pid_make_inode(): removes a
 * directory inode from the pid's list of inodes and drops the pid
 * reference that was stored in ei->pid.  The caller is responsible for
 * clearing ei->pid afterwards.
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
struct pid *pid = ei->pid;
/* Only directory inodes are ever put on pid->inodes, so only they need unlinking. */
if (S_ISDIR(ei->vfs_inode.i_mode)) {
/* pid->inodes is protected by wait_pidfd.lock, the same lock used when adding. */
spin_lock(&pid->wait_pidfd.lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->wait_pidfd.lock);
}
/* Drop the reference obtained via get_task_pid() in proc_pid_make_inode(). */
put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block * sb, struct inode *proc_pid_make_inode(struct super_block * sb,
struct task_struct *task, umode_t mode) struct task_struct *task, umode_t mode)
{ {
struct inode * inode; struct inode * inode;
struct proc_inode *ei; struct proc_inode *ei;
struct pid *pid;
/* We need a new inode */ /* We need a new inode */
...@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb, ...@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* /*
* grab the reference to task. * grab the reference to task.
*/ */
ei->pid = get_task_pid(task, PIDTYPE_PID); pid = get_task_pid(task, PIDTYPE_PID);
if (!ei->pid) if (!pid)
goto out_unlock; goto out_unlock;
/* Let the pid remember us for quick removal */
ei->pid = pid;
if (S_ISDIR(mode)) {
spin_lock(&pid->wait_pidfd.lock);
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
spin_unlock(&pid->wait_pidfd.lock);
}
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode); security_task_to_inode(task, inode);
...@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = { ...@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.permission = proc_pid_permission, .permission = proc_pid_permission,
}; };
/*
 * proc_flush_task_mnt - invalidate a task's dentries under one proc mount.
 * @mnt:  proc mount whose root is searched.
 * @pid:  numeric id of the task, used as the directory name.
 * @tgid: numeric id of the task's thread group.
 *
 * Looks in the dcache below @mnt's root for
 *	<pid>
 *	<tgid>/task/<pid>
 * and invalidates whichever of those dentries is found.  The second lookup
 * is skipped when @pid equals @tgid.  Dentries that are not present in the
 * dcache are silently ignored.
 */
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
struct dentry *dentry, *leader, *dir;
/* Room for the decimal form of a 32-bit unsigned value plus the NUL. */
char buf[10 + 1];
struct qstr name;
/* First: the top-level <pid> directory. */
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", pid);
/* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
d_invalidate(dentry);
dput(dentry);
}
/* Nothing more to look up when pid and tgid are the same number. */
if (pid == tgid)
return;
/* Second: walk <tgid> -> "task" -> <pid>, bailing out at the first miss. */
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", tgid);
leader = d_hash_and_lookup(mnt->mnt_root, &name);
if (!leader)
goto out;
name.name = "task";
name.len = strlen(name.name);
dir = d_hash_and_lookup(leader, &name);
if (!dir)
goto out_put_leader;
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", pid);
dentry = d_hash_and_lookup(dir, &name);
if (dentry) {
d_invalidate(dentry);
dput(dentry);
}
/* Release the lookup references in reverse order of acquisition. */
dput(dir);
out_put_leader:
dput(leader);
out:
return;
}
/** /**
* proc_flush_task - Remove dcache entries for @task from the /proc dcache. * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
* @task: task that should be flushed. * @pid: pid that should be flushed.
* *
* When flushing dentries from proc, one needs to flush them from global * This function walks a list of inodes (that belong to any proc
* proc (proc_mnt) and from all the namespaces' procs this task was seen * filesystem) that are attached to the pid and flushes them from
* in. This call is supposed to do all of this job. * the dentry cache.
*
* Looks in the dcache for
* /proc/@pid
* /proc/@tgid/task/@pid
* if either directory is present flushes it and all of it'ts children
* from the dcache.
* *
* It is safe and reasonable to cache /proc entries for a task until * It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with * that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be * useless entries, possibly causing useful dcache entries to be
* flushed instead. This routine is proved to flush those useless * flushed instead. This routine is provided to flush those useless
* dcache entries at process exit time. * dcache entries when a process is reaped.
* *
* NOTE: This routine is just an optimization so it does not guarantee * NOTE: This routine is just an optimization so it does not guarantee
* that no dcache entries will exist at process exit time it * that no dcache entries will exist after a process is reaped
* just makes it very unlikely that any will persist. * it just makes it very unlikely that any will persist.
*/ */
void proc_flush_task(struct task_struct *task) void proc_flush_pid(struct pid *pid)
{ {
int i; proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
struct pid *pid, *tgid; put_pid(pid);
struct upid *upid;
pid = task_pid(task);
tgid = task_tgid(task);
for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
} }
static struct dentry *proc_pid_instantiate(struct dentry * dentry, static struct dentry *proc_pid_instantiate(struct dentry * dentry,
......
...@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode) ...@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
/* Stop tracking associated processes */ /* Stop tracking associated processes */
if (ei->pid) { if (ei->pid) {
put_pid(ei->pid); proc_pid_evict_inode(ei);
ei->pid = NULL; ei->pid = NULL;
} }
......
...@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, ...@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations; extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *); extern int proc_setattr(struct dentry *, struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *); extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *); extern int pid_delete_dentry(const struct dentry *);
......
...@@ -62,6 +62,7 @@ struct pid ...@@ -62,6 +62,7 @@ struct pid
unsigned int level; unsigned int level;
/* lists of tasks that use this pid */ /* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */ /* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd; wait_queue_head_t wait_pidfd;
struct rcu_head rcu; struct rcu_head rcu;
......
...@@ -32,7 +32,7 @@ struct proc_ops { ...@@ -32,7 +32,7 @@ struct proc_ops {
typedef int (*proc_write_t)(struct file *, char *, size_t); typedef int (*proc_write_t)(struct file *, char *, size_t);
extern void proc_root_init(void); extern void proc_root_init(void);
extern void proc_flush_task(struct task_struct *); extern void proc_flush_pid(struct pid *);
extern struct proc_dir_entry *proc_symlink(const char *, extern struct proc_dir_entry *proc_symlink(const char *,
struct proc_dir_entry *, const char *); struct proc_dir_entry *, const char *);
...@@ -105,7 +105,7 @@ static inline void proc_root_init(void) ...@@ -105,7 +105,7 @@ static inline void proc_root_init(void)
{ {
} }
static inline void proc_flush_task(struct task_struct *task) static inline void proc_flush_pid(struct pid *pid)
{ {
} }
......
...@@ -191,6 +191,7 @@ void put_task_struct_rcu_user(struct task_struct *task) ...@@ -191,6 +191,7 @@ void put_task_struct_rcu_user(struct task_struct *task)
void release_task(struct task_struct *p) void release_task(struct task_struct *p)
{ {
struct task_struct *leader; struct task_struct *leader;
struct pid *thread_pid;
int zap_leader; int zap_leader;
repeat: repeat:
/* don't need to get the RCU readlock here - the process is dead and /* don't need to get the RCU readlock here - the process is dead and
...@@ -199,11 +200,11 @@ void release_task(struct task_struct *p) ...@@ -199,11 +200,11 @@ void release_task(struct task_struct *p)
atomic_dec(&__task_cred(p)->user->processes); atomic_dec(&__task_cred(p)->user->processes);
rcu_read_unlock(); rcu_read_unlock();
proc_flush_task(p);
cgroup_release(p); cgroup_release(p);
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
ptrace_release_task(p); ptrace_release_task(p);
thread_pid = get_pid(p->thread_pid);
__exit_signal(p); __exit_signal(p);
/* /*
...@@ -226,6 +227,7 @@ void release_task(struct task_struct *p) ...@@ -226,6 +227,7 @@ void release_task(struct task_struct *p)
} }
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
proc_flush_pid(thread_pid);
release_thread(p); release_thread(p);
put_task_struct_rcu_user(p); put_task_struct_rcu_user(p);
......
...@@ -258,6 +258,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, ...@@ -258,6 +258,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->tasks[type]); INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd); init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level; upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock); spin_lock_irq(&pidmap_lock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment