Commit ef2c41cf authored by Christian Brauner, committed by Tejun Heo

clone3: allow spawning processes into cgroups

This adds support for creating a process in a different cgroup than its
parent. Callers can limit and account processes and threads right from
the moment they are spawned:
- A service manager can directly spawn new services into dedicated
  cgroups.
- A process can be directly created in a frozen cgroup and will be
  frozen as well.
- The initial accounting jitter experienced by process supervisors and
  daemons is eliminated with this.
- Threaded applications or even thread implementations can choose to
  create a specific cgroup layout where each thread is spawned
  directly into a dedicated cgroup.

This feature is limited to the unified hierarchy. Callers need to pass
a directory file descriptor for the target cgroup. The caller can
choose to pass an O_PATH file descriptor. All usual migration
restrictions apply, i.e. there can be no processes in inner nodes. In
general, creating a process directly in a target cgroup adheres to all
migration restrictions.

One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does
not need to grab the write side of the cgroup_threadgroup_rwsem.
This global lock makes moving tasks/threads around super expensive. With
clone3() this lock is avoided.

Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent f3553220
...@@ -628,8 +628,9 @@ struct cgroup_subsys { ...@@ -628,8 +628,9 @@ struct cgroup_subsys {
void (*cancel_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset);
void (*attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset);
void (*post_attach)(void); void (*post_attach)(void);
int (*can_fork)(struct task_struct *task); int (*can_fork)(struct task_struct *task,
void (*cancel_fork)(struct task_struct *task); struct css_set *cset);
void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
void (*fork)(struct task_struct *task); void (*fork)(struct task_struct *task);
void (*exit)(struct task_struct *task); void (*exit)(struct task_struct *task);
void (*release)(struct task_struct *task); void (*release)(struct task_struct *task);
......
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
#include <linux/cgroup-defs.h> #include <linux/cgroup-defs.h>
struct kernel_clone_args;
#ifdef CONFIG_CGROUPS #ifdef CONFIG_CGROUPS
/* /*
...@@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, ...@@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk); struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p); void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p,
extern void cgroup_cancel_fork(struct task_struct *p); struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_cancel_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p); void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p); void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p); void cgroup_free(struct task_struct *p);
...@@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats, ...@@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry) { return -EINVAL; } struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {} static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p) { return 0; } static inline int cgroup_can_fork(struct task_struct *p,
static inline void cgroup_cancel_fork(struct task_struct *p) {} struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_cancel_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {}
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
struct task_struct; struct task_struct;
struct rusage; struct rusage;
union thread_union; union thread_union;
struct css_set;
/* All the bits taken by the old clone syscall. */ /* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL #define CLONE_LEGACY_FLAGS 0xffffffffULL
...@@ -29,6 +30,9 @@ struct kernel_clone_args { ...@@ -29,6 +30,9 @@ struct kernel_clone_args {
pid_t *set_tid; pid_t *set_tid;
/* Number of elements in *set_tid */ /* Number of elements in *set_tid */
size_t set_tid_size; size_t set_tid_size;
int cgroup;
struct cgroup *cgrp;
struct css_set *cset;
}; };
/* /*
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
/* Flags for the clone3() syscall. */ /* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
/* /*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
...@@ -81,6 +82,8 @@ ...@@ -81,6 +82,8 @@
* @set_tid_size: This defines the size of the array referenced * @set_tid_size: This defines the size of the array referenced
* in @set_tid. This cannot be larger than the * in @set_tid. This cannot be larger than the
* kernel's limit of nested PID namespaces. * kernel's limit of nested PID namespaces.
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
* a file descriptor for the cgroup.
* *
* The structure is versioned by size and thus extensible. * The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and * New struct members must go at the end of the struct and
...@@ -97,11 +100,13 @@ struct clone_args { ...@@ -97,11 +100,13 @@ struct clone_args {
__aligned_u64 tls; __aligned_u64 tls;
__aligned_u64 set_tid; __aligned_u64 set_tid;
__aligned_u64 set_tid_size; __aligned_u64 set_tid_size;
__aligned_u64 cgroup;
}; };
#endif #endif
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
/* /*
* Scheduling policies * Scheduling policies
......
...@@ -5881,8 +5881,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, ...@@ -5881,8 +5881,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* @child: pointer to task_struct of forking parent process. * @child: pointer to task_struct of forking parent process.
* *
* A task is associated with the init_css_set until cgroup_post_fork() * A task is associated with the init_css_set until cgroup_post_fork()
* attaches it to the parent's css_set. Empty cg_list indicates that * attaches it to the target css_set.
* @child isn't holding reference to its css_set.
*/ */
void cgroup_fork(struct task_struct *child) void cgroup_fork(struct task_struct *child)
{ {
...@@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f) ...@@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
return cgrp; return cgrp;
} }
/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * On success the referenced css_set is stored in @kargs->cset and, for
 * CLONE_INTO_CGROUP, the target cgroup is stored in @kargs->cgrp. On
 * failure the locks and references taken here are released again before
 * returning.
 *
 * Return: 0 on success; -EBADF if fget_raw() finds no file for
 * @kargs->cgroup; -ENODEV if the target cgroup is dead; -ENOMEM if
 * find_css_set() returns NULL; otherwise the error reported by
 * cgroup_get_from_file(), cgroup_may_write() or
 * cgroup_attach_permissions().
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
int ret;
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
struct file *f;
/* cgroup_mutex is only taken for CLONE_INTO_CGROUP, and always first. */
if (kargs->flags & CLONE_INTO_CGROUP)
mutex_lock(&cgroup_mutex);
cgroup_threadgroup_change_begin(current);
/* Take a reference on the css_set of the forking task (current). */
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
/*
 * Without CLONE_INTO_CGROUP the child uses the parent's css_set; the
 * reference taken above is handed over to kargs->cset.
 */
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
kargs->cset = cset;
return 0;
}
/*
 * NOTE(review): fget_raw() is presumably used so that O_PATH file
 * descriptors are accepted, as the commit message describes - confirm.
 */
f = fget_raw(kargs->cgroup);
if (!f) {
ret = -EBADF;
goto err;
}
sb = f->f_path.dentry->d_sb;
dst_cgrp = cgroup_get_from_file(f);
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
/* Reset so the error path does not cgroup_put() an ERR_PTR. */
dst_cgrp = NULL;
goto err;
}
if (cgroup_is_dead(dst_cgrp)) {
ret = -ENODEV;
goto err;
}
/*
 * Verify that the target cgroup is writable for us. This is
 * usually done by the vfs layer but since we're not going through
 * the vfs layer here we need to do it "manually".
 */
ret = cgroup_may_write(dst_cgrp, sb);
if (ret)
goto err;
/* The last argument is true unless a thread (CLONE_THREAD) is created. */
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD));
if (ret)
goto err;
kargs->cset = find_css_set(cset, dst_cgrp);
if (!kargs->cset) {
ret = -ENOMEM;
goto err;
}
/*
 * Success: drop the parent's css_set reference and the file, but keep
 * the target cgroup reference in kargs->cgrp. ret is 0 at this point.
 */
put_css_set(cset);
fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
/* Only reached with CLONE_INTO_CGROUP set, so cgroup_mutex is held. */
cgroup_threadgroup_change_end(current);
mutex_unlock(&cgroup_mutex);
if (f)
fput(f);
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
if (kargs->cset)
put_css_set(kargs->cset);
return ret;
}
/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Undo cgroup_css_set_fork(): release cgroup_threadgroup_rwsem, drop the
 * reference to the prepared css_set if one is still stored in
 * @kargs->cset and, if CLONE_INTO_CGROUP was requested, release
 * cgroup_mutex and drop the reference to the target cgroup.
 *
 * cgroup_css_set_fork() stores a referenced css_set in @kargs->cset
 * whether or not CLONE_INTO_CGROUP is set, so that reference has to be
 * dropped independently of the flag; otherwise it is leaked whenever an
 * ordinary fork fails after cgroup_css_set_fork() succeeded.
 * cgroup_post_fork() clears @kargs->cset before calling this function,
 * so the reference it consumed is not dropped a second time.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	struct css_set *cset = kargs->cset;
	bool into_cgroup = !!(kargs->flags & CLONE_INTO_CGROUP);

	cgroup_threadgroup_change_end(current);

	/* cgroup_mutex was only taken for CLONE_INTO_CGROUP. */
	if (into_cgroup)
		mutex_unlock(&cgroup_mutex);

	/* Still set only if ownership was not handed to the child. */
	if (cset) {
		put_css_set(cset);
		kargs->cset = NULL;
	}

	/* The target cgroup reference exists only for CLONE_INTO_CGROUP. */
	if (into_cgroup) {
		struct cgroup *cgrp = kargs->cgrp;

		if (cgrp) {
			cgroup_put(cgrp);
			kargs->cgrp = NULL;
		}
	}
}
/** /**
* cgroup_can_fork - called on a new task before the process is exposed * cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process * @child: the child process
* *
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
* This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
* callback returns an error, the fork aborts with that error code. This * callback returns an error, the fork aborts with that error code. This
* allows for a cgroup subsystem to conditionally allow or deny new forks. * allows for a cgroup subsystem to conditionally allow or deny new forks.
*/ */
int cgroup_can_fork(struct task_struct *child) int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
__acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
int i, j, ret; int i, j, ret;
cgroup_threadgroup_change_begin(current); ret = cgroup_css_set_fork(kargs);
if (ret)
return ret;
do_each_subsys_mask(ss, i, have_canfork_callback) { do_each_subsys_mask(ss, i, have_canfork_callback) {
ret = ss->can_fork(child); ret = ss->can_fork(child, kargs->cset);
if (ret) if (ret)
goto out_revert; goto out_revert;
} while_each_subsys_mask(); } while_each_subsys_mask();
...@@ -5937,32 +6066,34 @@ int cgroup_can_fork(struct task_struct *child) ...@@ -5937,32 +6066,34 @@ int cgroup_can_fork(struct task_struct *child)
if (j >= i) if (j >= i)
break; break;
if (ss->cancel_fork) if (ss->cancel_fork)
ss->cancel_fork(child); ss->cancel_fork(child, kargs->cset);
} }
cgroup_threadgroup_change_end(current); cgroup_css_set_put_fork(kargs);
return ret; return ret;
} }
/** /**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
* @child: the child process * @child: the child process
* * @kargs: the arguments passed to create the child process
* This calls the cancel_fork() callbacks if a fork failed *after* *
* cgroup_can_fork() succeded. * This calls the cancel_fork() callbacks if a fork failed *after*
*/ * cgroup_can_fork() succeeded and cleans up references we took to
void cgroup_cancel_fork(struct task_struct *child) * prepare a new css_set for the child process in cgroup_can_fork().
__releases(&cgroup_threadgroup_rwsem) */
void cgroup_cancel_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
int i; int i;
for_each_subsys(ss, i) for_each_subsys(ss, i)
if (ss->cancel_fork) if (ss->cancel_fork)
ss->cancel_fork(child); ss->cancel_fork(child, kargs->cset);
cgroup_threadgroup_change_end(current); cgroup_css_set_put_fork(kargs);
} }
/** /**
...@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child) ...@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
* Attach the child process to its css_set calling the subsystem fork() * Attach the child process to its css_set calling the subsystem fork()
* callbacks. * callbacks.
*/ */
void cgroup_post_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child,
__releases(&cgroup_threadgroup_rwsem) struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
struct css_set *cset; struct css_set *cset;
int i; int i;
cset = kargs->cset;
kargs->cset = NULL;
spin_lock_irq(&css_set_lock); spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */ /* init tasks are special, only link regular threads */
if (likely(child->pid)) { if (likely(child->pid)) {
WARN_ON_ONCE(!list_empty(&child->cg_list)); WARN_ON_ONCE(!list_empty(&child->cg_list));
cset = task_css_set(current); /* current is @child's parent */
get_css_set(cset);
cset->nr_tasks++; cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false); css_set_move_task(child, NULL, cset, false);
} else {
put_css_set(cset);
cset = NULL;
} }
/* /*
...@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child) ...@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
ss->fork(child); ss->fork(child);
} while_each_subsys_mask(); } while_each_subsys_mask();
cgroup_threadgroup_change_end(current); /* Make the new cset the root_cset of the new cgroup namespace. */
if (kargs->flags & CLONE_NEWCGROUP) {
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
get_css_set(cset);
child->nsproxy->cgroup_ns->root_cset = cset;
put_css_set(rcset);
}
cgroup_css_set_put_fork(kargs);
} }
/** /**
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <linux/atomic.h> #include <linux/atomic.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/sched/task.h>
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max" #define PIDS_MAX_STR "max"
...@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) ...@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process(). * on cgroup_threadgroup_change_begin() held by the copy_process().
*/ */
static int pids_can_fork(struct task_struct *task) static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{ {
struct cgroup_subsys_state *css; struct cgroup_subsys_state *css;
struct pids_cgroup *pids; struct pids_cgroup *pids;
int err; int err;
css = task_css_check(current, pids_cgrp_id, true); if (cset)
css = cset->subsys[pids_cgrp_id];
else
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css); pids = css_pids(css);
err = pids_try_charge(pids, 1); err = pids_try_charge(pids, 1);
if (err) { if (err) {
...@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task) ...@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
return err; return err;
} }
static void pids_cancel_fork(struct task_struct *task) static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{ {
struct cgroup_subsys_state *css; struct cgroup_subsys_state *css;
struct pids_cgroup *pids; struct pids_cgroup *pids;
css = task_css_check(current, pids_cgrp_id, true); if (cset)
css = cset->subsys[pids_cgrp_id];
else
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css); pids = css_pids(css);
pids_uncharge(pids, 1); pids_uncharge(pids, 1);
} }
......
...@@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process( ...@@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process(
* between here and cgroup_post_fork() if an organisation operation is in * between here and cgroup_post_fork() if an organisation operation is in
* progress. * progress.
*/ */
retval = cgroup_can_fork(p); retval = cgroup_can_fork(p, args);
if (retval) if (retval)
goto bad_fork_put_pidfd; goto bad_fork_put_pidfd;
...@@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process( ...@@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
proc_fork_connector(p); proc_fork_connector(p);
cgroup_post_fork(p); cgroup_post_fork(p, args);
perf_event_fork(p); perf_event_fork(p);
trace_task_newtask(p, clone_flags); trace_task_newtask(p, clone_flags);
...@@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process( ...@@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup: bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock); spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p); cgroup_cancel_fork(p, args);
bad_fork_put_pidfd: bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) { if (clone_flags & CLONE_PIDFD) {
fput(pidfile); fput(pidfile);
...@@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, ...@@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
!valid_signal(args.exit_signal))) !valid_signal(args.exit_signal)))
return -EINVAL; return -EINVAL;
if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
return -EINVAL;
*kargs = (struct kernel_clone_args){ *kargs = (struct kernel_clone_args){
.flags = args.flags, .flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd), .pidfd = u64_to_user_ptr(args.pidfd),
...@@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, ...@@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack_size = args.stack_size, .stack_size = args.stack_size,
.tls = args.tls, .tls = args.tls,
.set_tid_size = args.set_tid_size, .set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
}; };
if (args.set_tid && if (args.set_tid &&
...@@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) ...@@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
static bool clone3_args_valid(struct kernel_clone_args *kargs) static bool clone3_args_valid(struct kernel_clone_args *kargs)
{ {
/* Verify that no unknown flags are passed along. */ /* Verify that no unknown flags are passed along. */
if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) if (kargs->flags &
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
return false; return false;
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment