Commit a13ae697 authored by Eric W. Biederman's avatar Eric W. Biederman

proc: Dentry flushing without proc_mnt

Cleanly handling proc mount options require the internal mount of
proc to be removed (so mount options are not ignored), and quite
possibly multiple proc superblocks per pid namespace (so a
second mount of proc does not silently get the mount options of the
first mount of proc.  In either case being able to flush proc
dentries on process exit needs to be made to work without going
through proc_mnt.  After serveral discussions this is the set
of changes that work and no one objects to.

---

I have addressed all of the review comments as I understand them,
and fixed the small oversight the kernel test robot was able to
find. (I had failed to initialize the new field pid->inodes).

I did not hear any concerns from the 10,000 foot level last time
so I am assuming this set of changes (baring bugs) is good to go.

Unless some new issues appear my plan is to put this in my tree
and get this into linux-next.  Which will give Alexey something
to build his changes on.

I tested this set of changes by running:
 (while ls -1 -f /proc > /dev/null ; do :; done ) &
And monitoring the amount of free memory.

With the flushing disabled I saw the used memory in the system grow by
20M before the shrinker would bring it back down to where it started.
With the patch applied I saw the memory usage stay essentially fixed.

So flushing definitely keeps things working better.

Eric W. Biederman (6):
      proc: Rename in proc_inode rename sysctl_inodes sibling_inodes
      proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache
      proc: In proc_prune_siblings_dcache cache an aquired super block
      proc: Use d_invalidate in proc_prune_siblings_dcache
      proc: Clear the pieces of proc_inode that proc_evict_inode cares about
      proc: Use a list of inodes to flush from proc

 fs/proc/base.c          | 111 ++++++++++++++++--------------------------------
 fs/proc/inode.c         |  73 ++++++++++++++++++++++++++++---
 fs/proc/internal.h      |   4 +-
 fs/proc/proc_sysctl.c   |  45 +++-----------------
 include/linux/pid.h     |   1 +
 include/linux/proc_fs.h |   4 +-
 kernel/exit.c           |   4 +-
 kernel/pid.c            |   1 +
 8 files changed, 120 insertions(+), 123 deletions(-)

Link: https://lore.kernel.org/lkml/871rqk2brn.fsf_-_@x220.int.ebiederm.org/Signed-off-by: default avatar"Eric W. Biederman" <ebiederm@xmission.com>

Merge branch 'proc-dentry-flushing-without-proc-mnt-v2' into HEAD
parents 11a48a5a 7bc3e6e5
......@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
*rgid = gid;
}
void proc_pid_evict_inode(struct proc_inode *ei)
{
struct pid *pid = ei->pid;
if (S_ISDIR(ei->vfs_inode.i_mode)) {
spin_lock(&pid->wait_pidfd.lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->wait_pidfd.lock);
}
put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block * sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
struct pid *pid;
/* We need a new inode */
......@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/*
* grab the reference to task.
*/
ei->pid = get_task_pid(task, PIDTYPE_PID);
if (!ei->pid)
pid = get_task_pid(task, PIDTYPE_PID);
if (!pid)
goto out_unlock;
/* Let the pid remember us for quick removal */
ei->pid = pid;
if (S_ISDIR(mode)) {
spin_lock(&pid->wait_pidfd.lock);
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
spin_unlock(&pid->wait_pidfd.lock);
}
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
......@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.permission = proc_pid_permission,
};
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
struct dentry *dentry, *leader, *dir;
char buf[10 + 1];
struct qstr name;
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", pid);
/* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
d_invalidate(dentry);
dput(dentry);
}
if (pid == tgid)
return;
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", tgid);
leader = d_hash_and_lookup(mnt->mnt_root, &name);
if (!leader)
goto out;
name.name = "task";
name.len = strlen(name.name);
dir = d_hash_and_lookup(leader, &name);
if (!dir)
goto out_put_leader;
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", pid);
dentry = d_hash_and_lookup(dir, &name);
if (dentry) {
d_invalidate(dentry);
dput(dentry);
}
dput(dir);
out_put_leader:
dput(leader);
out:
return;
}
/**
* proc_flush_task - Remove dcache entries for @task from the /proc dcache.
* @task: task that should be flushed.
* proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
* @pid: pid that should be flushed.
*
* When flushing dentries from proc, one needs to flush them from global
* proc (proc_mnt) and from all the namespaces' procs this task was seen
* in. This call is supposed to do all of this job.
*
* Looks in the dcache for
* /proc/@pid
* /proc/@tgid/task/@pid
* if either directory is present flushes it and all of it'ts children
* from the dcache.
* This function walks a list of inodes (that belong to any proc
* filesystem) that are attached to the pid and flushes them from
* the dentry cache.
*
* It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be
* flushed instead. This routine is proved to flush those useless
* dcache entries at process exit time.
* flushed instead. This routine is provided to flush those useless
* dcache entries when a process is reaped.
*
* NOTE: This routine is just an optimization so it does not guarantee
* that no dcache entries will exist at process exit time it
* just makes it very unlikely that any will persist.
* that no dcache entries will exist after a process is reaped
* it just makes it very unlikely that any will persist.
*/
void proc_flush_task(struct task_struct *task)
void proc_flush_pid(struct pid *pid)
{
int i;
struct pid *pid, *tgid;
struct upid *upid;
pid = task_pid(task);
tgid = task_tgid(task);
for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
put_pid(pid);
}
static struct dentry *proc_pid_instantiate(struct dentry * dentry,
......
......@@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Stop tracking associated processes */
put_pid(PROC_I(inode)->pid);
if (ei->pid) {
proc_pid_evict_inode(ei);
ei->pid = NULL;
}
/* Let go of any associated proc directory entry */
de = PDE(inode);
if (de)
de = ei->pde;
if (de) {
pde_put(de);
ei->pde = NULL;
}
head = PROC_I(inode)->sysctl;
head = ei->sysctl;
if (head) {
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
RCU_INIT_POINTER(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
......@@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
INIT_HLIST_NODE(&ei->sibling_inodes);
ei->ns_ops = NULL;
return &ei->vfs_inode;
}
......@@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void)
BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
struct inode *inode;
struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
for (;;) {
struct super_block *sb;
node = hlist_first_rcu(inodes);
if (!node)
break;
ei = hlist_entry(node, struct proc_inode, sibling_inodes);
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
inode = &ei->vfs_inode;
sb = inode->i_sb;
if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
continue;
inode = igrab(inode);
rcu_read_unlock();
if (sb != old_sb) {
if (old_sb)
deactivate_super(old_sb);
old_sb = sb;
}
if (unlikely(!inode)) {
rcu_read_lock();
continue;
}
if (S_ISDIR(inode->i_mode)) {
struct dentry *dir = d_find_any_alias(inode);
if (dir) {
d_invalidate(dir);
dput(dir);
}
} else {
struct dentry *dentry;
while ((dentry = d_find_alias(inode))) {
d_invalidate(dentry);
dput(dentry);
}
}
iput(inode);
rcu_read_lock();
}
rcu_read_unlock();
if (old_sb)
deactivate_super(old_sb);
}
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
struct super_block *sb = root->d_sb;
......
......@@ -91,7 +91,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
struct hlist_node sysctl_inodes;
struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
} __randomize_layout;
......@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
......@@ -210,6 +211,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);
......
......@@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p)
complete(p->unregistering);
}
static void proc_sys_prune_dcache(struct ctl_table_header *head)
static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
{
struct inode *inode;
struct proc_inode *ei;
struct hlist_node *node;
struct super_block *sb;
rcu_read_lock();
for (;;) {
node = hlist_first_rcu(&head->inodes);
if (!node)
break;
ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
spin_lock(&sysctl_lock);
hlist_del_init_rcu(&ei->sysctl_inodes);
spin_unlock(&sysctl_lock);
inode = &ei->vfs_inode;
sb = inode->i_sb;
if (!atomic_inc_not_zero(&sb->s_active))
continue;
inode = igrab(inode);
rcu_read_unlock();
if (unlikely(!inode)) {
deactivate_super(sb);
rcu_read_lock();
continue;
}
d_prune_aliases(inode);
iput(inode);
deactivate_super(sb);
rcu_read_lock();
}
rcu_read_unlock();
proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
/* called under sysctl_lock, will reacquire if has to wait */
......@@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p)
spin_unlock(&sysctl_lock);
}
/*
* Prune dentries for unregistered sysctls: namespaced sysctls
* Invalidate dentries for unregistered sysctls: namespaced sysctls
* can have duplicate names and contaminate dcache very badly.
*/
proc_sys_prune_dcache(p);
proc_sys_invalidate_dcache(p);
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
......@@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
}
ei->sysctl = head;
ei->sysctl_entry = table;
hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
head->count++;
spin_unlock(&sysctl_lock);
......@@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
if (!--head->count)
kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
......
......@@ -62,6 +62,7 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
......
......@@ -32,7 +32,7 @@ struct proc_ops {
typedef int (*proc_write_t)(struct file *, char *, size_t);
extern void proc_root_init(void);
extern void proc_flush_task(struct task_struct *);
extern void proc_flush_pid(struct pid *);
extern struct proc_dir_entry *proc_symlink(const char *,
struct proc_dir_entry *, const char *);
......@@ -105,7 +105,7 @@ static inline void proc_root_init(void)
{
}
static inline void proc_flush_task(struct task_struct *task)
static inline void proc_flush_pid(struct pid *pid)
{
}
......
......@@ -191,6 +191,7 @@ void put_task_struct_rcu_user(struct task_struct *task)
void release_task(struct task_struct *p)
{
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
/* don't need to get the RCU readlock here - the process is dead and
......@@ -199,11 +200,11 @@ void release_task(struct task_struct *p)
atomic_dec(&__task_cred(p)->user->processes);
rcu_read_unlock();
proc_flush_task(p);
cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
thread_pid = get_pid(p->thread_pid);
__exit_signal(p);
/*
......@@ -226,6 +227,7 @@ void release_task(struct task_struct *p)
}
write_unlock_irq(&tasklist_lock);
proc_flush_pid(thread_pid);
release_thread(p);
put_task_struct_rcu_user(p);
......
......@@ -258,6 +258,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment