Commit a0d4a141 authored by Eric W. Biederman's avatar Eric W. Biederman

Proc mount option handling is broken, and it has been since I

accidentally broke it in the middle 2016.

The problem is that because we perform an internal mount of proc
before user space mounts proc all of the mount options that user
specifies when mounting proc are ignored.

You can set those mount options with a remount but that is rather
surprising.

This most directly affects android which is using hidpid=2 by default.

Now that the sysctl system call support has been removed, and we have
settled on way of flushing proc dentries when a process exits without
using proc_mnt, there is an simple and easy fix.

a) Give UML mconsole it's own private mount of proc to use.
b) Stop creating the internal mount of proc

We still need Alexey Gladkov's full patch to get proc mount options to
work inside of UML, and to be generally useful.  This set of changes
is just enough to get them working as well as they have in the past.

If anyone sees any problem with this code please let me know.

Otherwise I plan to merge these set of fixes through my tree.

Link: https://lore.kernel.org/lkml/87r21tuulj.fsf@x220.int.ebiederm.org/
Link: https://lore.kernel.org/lkml/871rqk2brn.fsf_-_@x220.int.ebiederm.org/
Link: https://lore.kernel.org/lkml/20200210150519.538333-1-gladkov.alexey@gmail.com/
Link: https://lore.kernel.org/lkml/20180611195744.154962-1-astrachan@google.com/
Fixes: e94591d0 ("proc: Convert proc_mount to use mount_ns.")

Eric W. Biederman (4):
      uml: Don't consult current to find the proc_mnt in mconsole_proc
      uml: Create a private mount of proc for mconsole
      proc: Remove the now unnecessary internal mount of proc
      pid: Improve the comment about waiting in zap_pid_ns_processes

 arch/um/drivers/mconsole_kern.c | 28 +++++++++++++++++++++++++++-
 fs/proc/root.c                  | 36 ------------------------------------
 include/linux/pid_namespace.h   |  2 --
 include/linux/proc_ns.h         |  5 -----
 kernel/pid.c                    |  8 --------
 kernel/pid_namespace.c          | 38 +++++++++++++++++++-------------------
 6 files changed, 46 insertions(+), 71 deletions(-)
parents a13ae697 af9fe6d6
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
#include "mconsole_kern.h" #include "mconsole_kern.h"
#include <os.h> #include <os.h>
static struct vfsmount *proc_mnt = NULL;
static int do_unlink_socket(struct notifier_block *notifier, static int do_unlink_socket(struct notifier_block *notifier,
unsigned long what, void *data) unsigned long what, void *data)
{ {
...@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req) ...@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req)
void mconsole_proc(struct mc_request *req) void mconsole_proc(struct mc_request *req)
{ {
struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; struct vfsmount *mnt = proc_mnt;
char *buf; char *buf;
int len; int len;
struct file *file; struct file *file;
...@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req) ...@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req)
ptr += strlen("proc"); ptr += strlen("proc");
ptr = skip_spaces(ptr); ptr = skip_spaces(ptr);
if (!mnt) {
mconsole_reply(req, "Proc not available", 1, 0);
goto out;
}
file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
if (IS_ERR(file)) { if (IS_ERR(file)) {
mconsole_reply(req, "Failed to open file", 1, 0); mconsole_reply(req, "Failed to open file", 1, 0);
...@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req) ...@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req)
with_console(req, stack_proc, to); with_console(req, stack_proc, to);
} }
static int __init mount_proc(void)
{
struct file_system_type *proc_fs_type;
struct vfsmount *mnt;
proc_fs_type = get_fs_type("proc");
if (!proc_fs_type)
return -ENODEV;
mnt = kern_mount(proc_fs_type);
put_filesystem(proc_fs_type);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
proc_mnt = mnt;
return 0;
}
/* /*
* Changed by mconsole_setup, which is __setup, and called before SMP is * Changed by mconsole_setup, which is __setup, and called before SMP is
* active. * active.
...@@ -696,6 +720,8 @@ static int __init mconsole_init(void) ...@@ -696,6 +720,8 @@ static int __init mconsole_init(void)
int err; int err;
char file[UNIX_PATH_MAX]; char file[UNIX_PATH_MAX];
mount_proc();
if (umid_file_name("mconsole", file, sizeof(file))) if (umid_file_name("mconsole", file, sizeof(file)))
return -1; return -1;
snprintf(mconsole_socket_name, sizeof(file), "%s", file); snprintf(mconsole_socket_name, sizeof(file), "%s", file);
......
...@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = { ...@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = {
.subdir = RB_ROOT, .subdir = RB_ROOT,
.name = "/proc", .name = "/proc",
}; };
int pid_ns_prepare_proc(struct pid_namespace *ns)
{
struct proc_fs_context *ctx;
struct fs_context *fc;
struct vfsmount *mnt;
fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
if (IS_ERR(fc))
return PTR_ERR(fc);
if (fc->user_ns != ns->user_ns) {
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ns->user_ns);
}
ctx = fc->fs_private;
if (ctx->pid_ns != ns) {
put_pid_ns(ctx->pid_ns);
get_pid_ns(ns);
ctx->pid_ns = ns;
}
mnt = fc_mount(fc);
put_fs_context(fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
ns->proc_mnt = mnt;
return 0;
}
void pid_ns_release_proc(struct pid_namespace *ns)
{
kern_unmount(ns->proc_mnt);
}
...@@ -33,7 +33,6 @@ struct pid_namespace { ...@@ -33,7 +33,6 @@ struct pid_namespace {
unsigned int level; unsigned int level;
struct pid_namespace *parent; struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
struct dentry *proc_self; struct dentry *proc_self;
struct dentry *proc_thread_self; struct dentry *proc_thread_self;
#endif #endif
...@@ -42,7 +41,6 @@ struct pid_namespace { ...@@ -42,7 +41,6 @@ struct pid_namespace {
#endif #endif
struct user_namespace *user_ns; struct user_namespace *user_ns;
struct ucounts *ucounts; struct ucounts *ucounts;
struct work_struct proc_work;
kgid_t pid_gid; kgid_t pid_gid;
int hide_pid; int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */ int reboot; /* group exit code if this pidns was rebooted */
......
...@@ -50,16 +50,11 @@ enum { ...@@ -50,16 +50,11 @@ enum {
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
extern int pid_ns_prepare_proc(struct pid_namespace *ns);
extern void pid_ns_release_proc(struct pid_namespace *ns);
extern int proc_alloc_inum(unsigned int *pino); extern int proc_alloc_inum(unsigned int *pino);
extern void proc_free_inum(unsigned int inum); extern void proc_free_inum(unsigned int inum);
#else /* CONFIG_PROC_FS */ #else /* CONFIG_PROC_FS */
static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
static inline int proc_alloc_inum(unsigned int *inum) static inline int proc_alloc_inum(unsigned int *inum)
{ {
*inum = 1; *inum = 1;
......
...@@ -144,9 +144,6 @@ void free_pid(struct pid *pid) ...@@ -144,9 +144,6 @@ void free_pid(struct pid *pid)
/* Handle a fork failure of the first process */ /* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper); WARN_ON(ns->child_reaper);
ns->pid_allocated = 0; ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
break; break;
} }
...@@ -247,11 +244,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, ...@@ -247,11 +244,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
tmp = tmp->parent; tmp = tmp->parent;
} }
if (unlikely(is_child_reaper(pid))) {
if (pid_ns_prepare_proc(ns))
goto out_free;
}
get_pid_ns(ns); get_pid_ns(ns);
refcount_set(&pid->count, 1); refcount_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type) for (type = 0; type < PIDTYPE_MAX; ++type)
......
...@@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) ...@@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
return READ_ONCE(*pkc); return READ_ONCE(*pkc);
} }
static void proc_cleanup_work(struct work_struct *work)
{
struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
pid_ns_release_proc(ns);
}
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{ {
return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
...@@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ...@@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns); ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts; ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING; ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
return ns; return ns;
...@@ -231,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) ...@@ -231,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
} while (rc != -ECHILD); } while (rc != -ECHILD);
/* /*
* kernel_wait4() above can't reap the EXIT_DEAD children but we do not * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
* really care, we could reparent them to the global init. We could * process whose parents processes are outside of the pid
* exit and reap ->child_reaper even if it is not the last thread in * namespace. Such processes are created with setns()+fork().
* this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), *
* pid_ns can not go away until proc_kill_sb() drops the reference. * If those EXIT_ZOMBIE processes are not reaped by their
* parents before their parents exit, they will be reparented
* to pid_ns->child_reaper. Thus pidns->child_reaper needs to
* stay valid until they all go away.
*
* The code relies on the the pid_ns->child_reaper ignoring
* SIGCHILD to cause those EXIT_ZOMBIE processes to be
* autoreaped if reparented.
* *
* But this ns can also have other tasks injected by setns()+fork(). * Semantically it is also desirable to wait for EXIT_ZOMBIE
* Again, ignoring the user visible semantics we do not really need * processes before allowing the child_reaper to be reaped, as
* to wait until they are all reaped, but they can be reparented to * that gives the invariant that when the init process of a
* us and thus we need to ensure that pid->child_reaper stays valid * pid namespace is reaped all of the processes in the pid
* until they all go away. See free_pid()->wake_up_process(). * namespace are gone.
* *
* We rely on ignored SIGCHLD, an injected zombie must be autoreaped * Once all of the other tasks are gone from the pid_namespace
* if reparented. * free_pid() will awaken this task.
*/ */
for (;;) { for (;;) {
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment