Commit 6b5f04b6 authored by Linus Torvalds

Merge branch 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "cgroup changes for v4.6-rc1.  No userland visible behavior changes in
  this pull request.  I'll send out a separate pull request for the
  addition of cgroup namespace support.

   - The biggest change is the revamping of cgroup core task migration
     and controller handling logic.  There are quite a few places where
     controllers and tasks are manipulated.  Previously, many of those
     places implemented custom operations for each specific use case,
     assuming specific starting conditions.  While this worked, it made
     the code fragile and difficult to follow.

     The bulk of this pull request restructures these operations so that
     most related operations are performed through common helpers which
     are recursive (subtrees are always processed consistently) and
     idempotent (they make the cgroup hierarchy converge to the target
     state rather than performing operations that assume specific
     starting conditions).  This makes the code a lot easier to
     understand, verify and extend.  (A toy sketch of this convergence
     pattern follows this message.)

   - Implicit controller support is added.  This is primarily for using
     perf_event on the v2 hierarchy so that perf can match cgroup v2
     paths without requiring the user to do anything special.  The
     kernel portion of the perf_event changes is acked but the userland
     changes are still pending review.

   - cgroup_no_v1= boot parameter added to ease testing cgroup v2 in
     certain environments.

   - There is a regression introduced during the v4.4 devel cycle where
     attempts to migrate zombie tasks can mess up internal object
     management.  This was fixed earlier this week and is included in
     this pull request w/ stable cc'd.

   - Misc non-critical fixes and improvements"
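A rough userspace sketch of the recursive-and-idempotent pattern described
in the first bullet above (all names below are invented for illustration;
the kernel's actual helpers are cgroup_apply_control() and friends, which
appear in the diff further down):

#include <stdio.h>

struct node {
	unsigned int subtree_control;	/* configured controller mask */
	unsigned int ss_mask;		/* effective (converged) mask */
	struct node *child;		/* one child keeps the toy small */
};

/*
 * Converge @n's subtree to the state implied by the configured masks.
 * Starting from any state, one call (or ten) yields the same result:
 * the hierarchy converges to the target instead of being patched by
 * deltas that assume a particular starting condition.
 */
static void apply_control(struct node *n, unsigned int avail)
{
	n->ss_mask = n->subtree_control & avail;	/* converge, don't delta */
	if (n->child)
		apply_control(n->child, n->ss_mask);	/* recurse into subtree */
}

int main(void)
{
	struct node leaf = { .subtree_control = 0x3 };
	struct node root = { .subtree_control = 0x7, .child = &leaf };

	apply_control(&root, 0xf);
	apply_control(&root, 0xf);	/* idempotent: second call is a no-op */
	printf("root=%#x leaf=%#x\n", root.ss_mask, leaf.ss_mask);
	return 0;
}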

* 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (44 commits)
  cgroup: avoid false positive gcc-6 warning
  cgroup: ignore css_sets associated with dead cgroups during migration
  Documentation: cgroup v2: Trivial heading correction.
  cgroup: implement cgroup_subsys->implicit_on_dfl
  cgroup: use css_set->mg_dst_cgrp for the migration target cgroup
  cgroup: make cgroup[_taskset]_migrate() take cgroup_root instead of cgroup
  cgroup: move migration destination verification out of cgroup_migrate_prepare_dst()
  cgroup: fix incorrect destination cgroup in cgroup_update_dfl_csses()
  cgroup: Trivial correction to reflect controller.
  cgroup: remove stale item in cgroup-v1 document INDEX file.
  cgroup: update css iteration in cgroup_update_dfl_csses()
  cgroup: allocate 2x cgrp_cset_links when setting up a new root
  cgroup: make cgroup_calc_subtree_ss_mask() take @this_ss_mask
  cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends
  cgroup: use cgroup_apply_enable_control() in cgroup creation path
  cgroup: combine cgroup_mutex locking and offline css draining
  cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write()
  cgroup: introduce cgroup_{save|propagate|restore}_control()
  cgroup: make cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() recursive
  cgroup: factor out cgroup_apply_control_enable() from cgroup_subtree_control_write()
  ...
parents fcab86ad cfe02a8a
Documentation/cgroup-v1/00-INDEX
@@ -24,5 +24,3 @@ net_prio.txt
   - Network priority cgroups details and usages.
 pids.txt
   - Process number cgroups details and usages.
-unified-hierarchy.txt
-  - Description the new/next cgroup interface.
Documentation/cgroup-v2.txt
@@ -132,6 +132,12 @@ strongly discouraged for production use.  It is recommended to decide
 the hierarchies and controller associations before starting using the
 controllers after system boot.
 
+During transition to v2, system management software might still
+automount the v1 cgroup filesystem and so hijack all controllers
+during boot, before manual intervention is possible.  To make testing
+and experimenting easier, the kernel parameter cgroup_no_v1= allows
+disabling controllers in v1 and make them always available in v2.
+
 
 2-2. Organizing Processes
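As a concrete usage example of the parameter added in the hunk above (the
controller names are only illustrative): booting with
cgroup_no_v1=memory,pids keeps those controllers out of every v1
hierarchy so that a cgroup v2 mount can claim them, and cgroup_no_v1=all
does the same for all controllers, matching the Format line in the
kernel-parameters hunk below.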
@@ -915,7 +921,7 @@ PAGE_SIZE multiple when read back.
 	limit, anonymous meomry of the cgroup will not be swapped out.
 
-5-2-2. General Usage
+5-2-2. Usage Guidelines
 
 "memory.high" is the main mechanism to control memory usage.
 Over-committing on high limit (sum of high limits > available memory)
Documentation/kernel-parameters.txt
@@ -614,6 +614,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
 
+	cgroup_no_v1=	[KNL] Disable one, multiple, all cgroup controllers in v1
+			Format: { controller[,controller...] | "all" }
+			Like cgroup_disable, but only applies to cgroup v1;
+			the blacklisted controllers remain available in cgroup2.
+
 	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
 			Format: <string>
 			nosocket -- Disable socket memory accounting.
include/linux/cgroup-defs.h
@@ -45,6 +45,7 @@ enum {
 	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
+	CSS_VISIBLE	= (1 << 3), /* css is visible to userland */
 };
 
 /* bits in struct cgroup flags field */
@@ -190,12 +191,13 @@ struct css_set {
 	/*
 	 * If this cset is acting as the source of migration the following
-	 * two fields are set.  mg_src_cgrp is the source cgroup of the
-	 * on-going migration and mg_dst_cset is the destination cset the
-	 * target tasks on this cset should be migrated to.  Protected by
-	 * cgroup_mutex.
+	 * two fields are set.  mg_src_cgrp and mg_dst_cgrp are
+	 * respectively the source and destination cgroups of the on-going
+	 * migration.  mg_dst_cset is the destination cset the target tasks
+	 * on this cset should be migrated to.  Protected by cgroup_mutex.
 	 */
 	struct cgroup *mg_src_cgrp;
+	struct cgroup *mg_dst_cgrp;
 	struct css_set *mg_dst_cset;
 
 	/*
@@ -210,6 +212,9 @@ struct css_set {
 	/* all css_task_iters currently walking this cset */
 	struct list_head task_iters;
 
+	/* dead and being drained, ignore for migration */
+	bool dead;
+
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
 };
@@ -253,13 +258,14 @@ struct cgroup {
 	/*
 	 * The bitmask of subsystems enabled on the child cgroups.
 	 * ->subtree_control is the one configured through
-	 * "cgroup.subtree_control" while ->child_subsys_mask is the
-	 * effective one which may have more subsystems enabled.
-	 * Controller knobs are made available iff it's enabled in
-	 * ->subtree_control.
+	 * "cgroup.subtree_control" while ->child_ss_mask is the effective
+	 * one which may have more subsystems enabled.  Controller knobs
+	 * are made available iff it's enabled in ->subtree_control.
 	 */
-	unsigned int subtree_control;
-	unsigned int child_subsys_mask;
+	u16 subtree_control;
+	u16 subtree_ss_mask;
+	u16 old_subtree_control;
+	u16 old_subtree_ss_mask;
 
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
@@ -434,7 +440,6 @@ struct cgroup_subsys {
 	void (*css_released)(struct cgroup_subsys_state *css);
 	void (*css_free)(struct cgroup_subsys_state *css);
 	void (*css_reset)(struct cgroup_subsys_state *css);
-	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
 
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
@@ -446,7 +451,20 @@ struct cgroup_subsys {
 	void (*free)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
-	int early_init;
+	bool early_init:1;
+
+	/*
+	 * If %true, the controller, on the default hierarchy, doesn't show
+	 * up in "cgroup.controllers" or "cgroup.subtree_control", is
+	 * implicitly enabled on all cgroups on the default hierarchy, and
+	 * bypasses the "no internal process" constraint.  This is for
+	 * utility type controllers which is transparent to userland.
+	 *
+	 * An implicit controller can be stolen from the default hierarchy
+	 * anytime and thus must be okay with offline csses from previous
+	 * hierarchies coexisting with csses for the current one.
+	 */
+	bool implicit_on_dfl:1;
 
 	/*
 	 * If %false, this subsystem is properly hierarchical -
@@ -460,8 +478,8 @@ struct cgroup_subsys {
 	 * cases.  Eventually, all subsystems will be made properly
 	 * hierarchical and this will go away.
 	 */
-	bool broken_hierarchy;
-	bool warned_broken_hierarchy;
+	bool broken_hierarchy:1;
+	bool warned_broken_hierarchy:1;
 
 	/* the following two fields are initialized automtically during boot */
 	int id;
init/Kconfig
@@ -1047,10 +1047,10 @@ config CGROUP_PIDS
 	  is fairly trivial to reach PID exhaustion before you reach even a
 	  conservative kmemcg limit.  As a result, it is possible to grind a
 	  system to halt without being limited by other cgroup policies.  The
-	  PIDs cgroup subsystem is designed to stop this from happening.
+	  PIDs controller is designed to stop this from happening.
 
 	  It should be noted that organisational operations (such as attaching
-	  to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+	  to a cgroup hierarchy will *not* be blocked by the PIDs controller),
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
kernel/Makefile
@@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \
 obj-$(CONFIG_MULTIUSER) += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
-# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+# Do not trace internal ftrace files
 CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
kernel/cgroup.c
@@ -178,10 +178,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  * The default hierarchy always exists but is hidden until mounted for the
  * first time.  This is for backward compatibility.
  */
-static bool cgrp_dfl_root_visible;
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
 
 /* some controllers are not supported in the default hierarchy */
-static unsigned long cgrp_dfl_root_inhibit_ss_mask;
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
 
 /* The list of hierarchy roots */
@@ -205,23 +211,25 @@ static u64 css_serial_nr_next = 1;
  * fork/exit handlers to call. This avoids us having to do extra work in the
  * fork/exit path to check which subsystems have fork/exit callbacks.
  */
-static unsigned long have_fork_callback __read_mostly;
-static unsigned long have_exit_callback __read_mostly;
-static unsigned long have_free_callback __read_mostly;
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
 
 /* Ditto for the can_fork callback. */
-static unsigned long have_canfork_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
 
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
-static int rebind_subsystems(struct cgroup_root *dst_root,
-			     unsigned long ss_mask);
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
-		      bool visible);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss);
 static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
@@ -238,9 +246,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  */
 static bool cgroup_ssid_enabled(int ssid)
 {
+	if (CGROUP_SUBSYS_COUNT == 0)
+		return false;
+
 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
+static bool cgroup_ssid_no_v1(int ssid)
+{
+	return cgroup_no_v1_mask & (1 << ssid);
+}
+
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
  * @cgrp: the cgroup of interest
@@ -339,6 +355,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp)
 	return NULL;
 }
 
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	u16 root_ss_mask = cgrp->root->subsys_mask;
+
+	if (parent)
+		return parent->subtree_control;
+
+	if (cgroup_on_dfl(cgrp))
+		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+				  cgrp_dfl_implicit_ss_mask);
+	return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	if (parent)
+		return parent->subtree_ss_mask;
+
+	return cgrp->root->subsys_mask;
+}
+
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -378,16 +420,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 	if (!ss)
 		return &cgrp->self;
 
-	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
-		return NULL;
-
 	/*
 	 * This function is used while updating css associations and thus
-	 * can't test the csses directly.  Use ->child_subsys_mask.
+	 * can't test the csses directly.  Test ss_mask.
 	 */
-	while (cgroup_parent(cgrp) &&
-	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
-		cgrp = cgroup_parent(cgrp);
+	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
+		cgrp = cgroup_parent(cgrp);
+		if (!cgrp)
+			return NULL;
+	}
 
 	return cgroup_css(cgrp, ss);
 }
@@ -506,22 +547,28 @@ static int notify_on_release(const struct cgroup *cgrp)
 		(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
 /**
- * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- * @ss_maskp: a pointer to the bitmask
+ * @ss_mask: the bitmask
  *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
- * mask is set to 1.
+ * @ss_mask is set.
 */
-#define for_each_subsys_which(ss, ssid, ss_maskp)			\
-	if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */	\
-		(ssid) = 0;						\
-	else								\
-		for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)	\
-			if (((ss) = cgroup_subsys[ssid]) && false)	\
-				break;					\
-			else
+#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
+	unsigned long __ss_mask = (ss_mask);				\
+	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
+		(ssid) = 0;						\
+		break;							\
+	}								\
+	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
+		(ss) = cgroup_subsys[ssid];				\
+		{
+
+#define while_each_subsys_mask()					\
+		}							\
+	}								\
+} while (false)
 
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
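The new pair is used like an ordinary block statement.  A minimal fragment
in the same style as cgroup_print_ss_mask() later in this diff
(enabled_mask is a hypothetical u16 variable):

	struct cgroup_subsys *ss;
	int ssid;

	/* body runs once for each subsystem whose bit is set in enabled_mask */
	do_each_subsys_mask(ss, ssid, enabled_mask) {
		pr_info("enabled: %s\n", ss->name);
	} while_each_subsys_mask();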
@@ -535,6 +582,24 @@ static int notify_on_release(const struct cgroup *cgrp)
 		;						\
 	else
 
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
+	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       (dsct) = (d_css)->cgroup;			\
+		       cgroup_is_dead(dsct); }))			\
+			;						\
+		else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
+	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       (dsct) = (d_css)->cgroup;			\
+		       cgroup_is_dead(dsct); }))			\
+			;						\
+		else
+
 static void cgroup_release_agent(struct work_struct *work);
 static void check_for_release(struct cgroup *cgrp);
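A fragment showing how the preorder variant reads in practice
(cgroup_update_dfl_csses() further down uses exactly this shape;
cgroup_mutex must be held, which the embedded lockdep assertion enforces):

	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;

	/* visits @cgrp itself and every live descendant, top-down;
	 * dead cgroups are filtered out by the macro itself */
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		pr_info("visiting %s\n", dsct->kn->name);
	}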
@@ -665,6 +730,9 @@ static void css_set_move_task(struct task_struct *task,
 {
 	lockdep_assert_held(&css_set_lock);
 
+	if (to_cset && !css_set_populated(to_cset))
+		css_set_update_populated(to_cset, true);
+
 	if (from_cset) {
 		struct css_task_iter *it, *pos;
@@ -698,8 +766,6 @@ static void css_set_move_task(struct task_struct *task,
 		 */
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
-		if (!css_set_populated(to_cset))
-			css_set_update_populated(to_cset, true);
-
 		rcu_assign_pointer(task->cgroups, to_cset);
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
@@ -1102,13 +1168,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	struct cgroup *cgrp = &root->cgrp;
 	struct cgrp_cset_link *link, *tmp_link;
 
-	mutex_lock(&cgroup_mutex);
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
 	BUG_ON(atomic_read(&root->nr_cgrps));
 	BUG_ON(!list_empty(&cgrp->self.children));
 
 	/* Rebind all subsystems back to the default hierarchy */
-	rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
 
 	/*
 	 * Release all the links from cset_links to this hierarchy's
@@ -1248,46 +1314,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 }
 
 /**
- * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- * @cgrp: the target cgroup
+ * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
  * @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
  *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp.  The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
-static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
-						   unsigned long subtree_control)
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
-	unsigned long cur_ss_mask = subtree_control;
+	u16 cur_ss_mask = subtree_control;
 	struct cgroup_subsys *ss;
 	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (!cgroup_on_dfl(cgrp))
-		return cur_ss_mask;
+	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
 
 	while (true) {
-		unsigned long new_ss_mask = cur_ss_mask;
+		u16 new_ss_mask = cur_ss_mask;
 
-		for_each_subsys_which(ss, ssid, &cur_ss_mask)
+		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
 			new_ss_mask |= ss->depends_on;
+		} while_each_subsys_mask();
 
 		/*
 		 * Mask out subsystems which aren't available.  This can
 		 * happen only if some depended-upon subsystems were bound
 		 * to non-default hierarchies.
 		 */
-		if (parent)
-			new_ss_mask &= parent->child_subsys_mask;
-		else
-			new_ss_mask &= cgrp->root->subsys_mask;
+		new_ss_mask &= this_ss_mask;
 
 		if (new_ss_mask == cur_ss_mask)
 			break;
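The while (true) loop above computes a fixed point: each pass ORs in the
->depends_on mask of every subsystem already in the mask, clamps the
result to what is available, and stops once nothing changes.  A
self-contained userspace rendering of the same closure computation, over
an invented four-subsystem depends_on table:

#include <stdio.h>

#define NSUBSYS 4

/* toy table: subsys 2 depends on 0, subsys 3 depends on 2 */
static const unsigned int depends_on[NSUBSYS] = { 0x0, 0x0, 0x1, 0x4 };

static unsigned int calc_ss_mask(unsigned int control, unsigned int avail)
{
	unsigned int cur = control;

	while (1) {
		unsigned int next = cur;
		int ssid;

		for (ssid = 0; ssid < NSUBSYS; ssid++)
			if (cur & (1u << ssid))
				next |= depends_on[ssid];

		next &= avail;		/* drop subsystems bound elsewhere */
		if (next == cur)
			return cur;	/* fixed point reached */
		cur = next;
	}
}

int main(void)
{
	/* asking for subsys 3 pulls in 2, which pulls in 0 -> prints 0xd */
	printf("%#x\n", calc_ss_mask(0x8, 0xf));
	return 0;
}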
...@@ -1297,19 +1357,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, ...@@ -1297,19 +1357,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
return cur_ss_mask; return cur_ss_mask;
} }
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
* @cgrp: the target cgroup
*
* Update @cgrp->child_subsys_mask according to the current
* @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
*/
static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
{
cgrp->child_subsys_mask =
cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
}
/** /**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced * @kn: the kernfs_node being serviced
...@@ -1338,19 +1385,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) ...@@ -1338,19 +1385,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
/** /**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced * @kn: the kernfs_node being serviced
* @drain_offline: perform offline draining on the cgroup
* *
* This helper is to be used by a cgroup kernfs method currently servicing * This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and * @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if * verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a * alive; otherwise, %NULL. A successful return should be undone by a
* matching cgroup_kn_unlock() invocation. * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
* cgroup is drained of offlining csses before return.
* *
* Any cgroup kernfs method implementation which requires locking the * Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup * associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations * locking under kernfs active protection and allows all kernfs operations
* including self-removal. * including self-removal.
*/ */
static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
bool drain_offline)
{ {
struct cgroup *cgrp; struct cgroup *cgrp;
...@@ -1369,6 +1419,9 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) ...@@ -1369,6 +1419,9 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
return NULL; return NULL;
kernfs_break_active_protection(kn); kernfs_break_active_protection(kn);
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
if (!cgroup_is_dead(cgrp)) if (!cgroup_is_dead(cgrp))
...@@ -1399,14 +1452,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) ...@@ -1399,14 +1452,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/** /**
* css_clear_dir - remove subsys files in a cgroup directory * css_clear_dir - remove subsys files in a cgroup directory
* @css: taget css * @css: taget css
* @cgrp_override: specify if target cgroup is different from css->cgroup
*/ */
static void css_clear_dir(struct cgroup_subsys_state *css, static void css_clear_dir(struct cgroup_subsys_state *css)
struct cgroup *cgrp_override)
{ {
struct cgroup *cgrp = cgrp_override ?: css->cgroup; struct cgroup *cgrp = css->cgroup;
struct cftype *cfts; struct cftype *cfts;
if (!(css->flags & CSS_VISIBLE))
return;
css->flags &= ~CSS_VISIBLE;
list_for_each_entry(cfts, &css->ss->cfts, node) list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false); cgroup_addrm_files(css, cgrp, cfts, false);
} }
...@@ -1414,17 +1470,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css, ...@@ -1414,17 +1470,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css,
/** /**
* css_populate_dir - create subsys files in a cgroup directory * css_populate_dir - create subsys files in a cgroup directory
* @css: target css * @css: target css
* @cgrp_overried: specify if target cgroup is different from css->cgroup
* *
* On failure, no file is added. * On failure, no file is added.
*/ */
static int css_populate_dir(struct cgroup_subsys_state *css, static int css_populate_dir(struct cgroup_subsys_state *css)
struct cgroup *cgrp_override)
{ {
struct cgroup *cgrp = cgrp_override ?: css->cgroup; struct cgroup *cgrp = css->cgroup;
struct cftype *cfts, *failed_cfts; struct cftype *cfts, *failed_cfts;
int ret; int ret;
if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
return 0;
if (!css->ss) { if (!css->ss) {
if (cgroup_on_dfl(cgrp)) if (cgroup_on_dfl(cgrp))
cfts = cgroup_dfl_base_files; cfts = cgroup_dfl_base_files;
...@@ -1441,6 +1498,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, ...@@ -1441,6 +1498,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css,
goto err; goto err;
} }
} }
css->flags |= CSS_VISIBLE;
return 0; return 0;
err: err:
list_for_each_entry(cfts, &css->ss->cfts, node) { list_for_each_entry(cfts, &css->ss->cfts, node) {
...@@ -1451,67 +1511,30 @@ static int css_populate_dir(struct cgroup_subsys_state *css, ...@@ -1451,67 +1511,30 @@ static int css_populate_dir(struct cgroup_subsys_state *css,
return ret; return ret;
} }
static int rebind_subsystems(struct cgroup_root *dst_root, static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
unsigned long ss_mask)
{ {
struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
unsigned long tmp_ss_mask;
int ssid, i, ret; int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
for_each_subsys_which(ss, ssid, &ss_mask) { do_each_subsys_mask(ss, ssid, ss_mask) {
/* if @ss has non-root csses attached to it, can't move */ /*
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) * If @ss has non-root csses attached to it, can't move.
* If @ss is an implicit controller, it is exempt from this
* rule and can be stolen.
*/
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
!ss->implicit_on_dfl)
return -EBUSY; return -EBUSY;
/* can't move between two non-dummy roots either */ /* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY; return -EBUSY;
} } while_each_subsys_mask();
/* skip creating root files on dfl_root for inhibited subsystems */
tmp_ss_mask = ss_mask;
if (dst_root == &cgrp_dfl_root)
tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
for_each_subsys_which(ss, ssid, &tmp_ss_mask) { do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup *scgrp = &ss->root->cgrp;
int tssid;
ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
if (!ret)
continue;
/*
* Rebinding back to the default root is not allowed to
* fail. Using both default and non-default roots should
* be rare. Moving subsystems back and forth even more so.
* Just warn about it and continue.
*/
if (dst_root == &cgrp_dfl_root) {
if (cgrp_dfl_root_visible) {
pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
ret, ss_mask);
pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
continue;
}
for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
if (tssid == ssid)
break;
css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
}
return ret;
}
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
for_each_subsys_which(ss, ssid, &ss_mask) {
struct cgroup_root *src_root = ss->root; struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp; struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
...@@ -1519,8 +1542,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ...@@ -1519,8 +1542,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
WARN_ON(!css || cgroup_css(dcgrp, ss)); WARN_ON(!css || cgroup_css(dcgrp, ss));
css_clear_dir(css, NULL); /* disable from the source */
src_root->subsys_mask &= ~(1 << ssid);
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css); rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root; ss->root = dst_root;
...@@ -1532,23 +1559,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ...@@ -1532,23 +1559,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
&dcgrp->e_csets[ss->id]); &dcgrp->e_csets[ss->id]);
spin_unlock_bh(&css_set_lock); spin_unlock_bh(&css_set_lock);
src_root->subsys_mask &= ~(1 << ssid);
scgrp->subtree_control &= ~(1 << ssid);
cgroup_refresh_child_subsys_mask(scgrp);
/* default hierarchy doesn't enable controllers by default */ /* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid; dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) { if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else { } else {
dcgrp->subtree_control |= 1 << ssid; dcgrp->subtree_control |= 1 << ssid;
cgroup_refresh_child_subsys_mask(dcgrp);
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
} }
ret = cgroup_apply_control(dcgrp);
if (ret)
pr_warn("partial failure to rebind %s controller (err=%d)\n",
ss->name, ret);
if (ss->bind) if (ss->bind)
ss->bind(css); ss->bind(css);
} } while_each_subsys_mask();
kernfs_activate(dcgrp->kn); kernfs_activate(dcgrp->kn);
return 0; return 0;
...@@ -1584,7 +1611,7 @@ static int cgroup_show_options(struct seq_file *seq, ...@@ -1584,7 +1611,7 @@ static int cgroup_show_options(struct seq_file *seq,
} }
struct cgroup_sb_opts { struct cgroup_sb_opts {
unsigned long subsys_mask; u16 subsys_mask;
unsigned int flags; unsigned int flags;
char *release_agent; char *release_agent;
bool cpuset_clone_children; bool cpuset_clone_children;
...@@ -1597,13 +1624,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) ...@@ -1597,13 +1624,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{ {
char *token, *o = data; char *token, *o = data;
bool all_ss = false, one_ss = false; bool all_ss = false, one_ss = false;
unsigned long mask = -1UL; u16 mask = U16_MAX;
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
int nr_opts = 0; int nr_opts = 0;
int i; int i;
#ifdef CONFIG_CPUSETS #ifdef CONFIG_CPUSETS
mask = ~(1U << cpuset_cgrp_id); mask = ~((u16)1 << cpuset_cgrp_id);
#endif #endif
memset(opts, 0, sizeof(*opts)); memset(opts, 0, sizeof(*opts));
...@@ -1678,6 +1705,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) ...@@ -1678,6 +1705,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue; continue;
if (!cgroup_ssid_enabled(i)) if (!cgroup_ssid_enabled(i))
continue; continue;
if (cgroup_ssid_no_v1(i))
continue;
/* Mutually exclusive option 'all' + subsystem name */ /* Mutually exclusive option 'all' + subsystem name */
if (all_ss) if (all_ss)
...@@ -1698,7 +1727,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) ...@@ -1698,7 +1727,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
*/ */
if (all_ss || (!one_ss && !opts->none && !opts->name)) if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i) for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i)) if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
opts->subsys_mask |= (1 << i); opts->subsys_mask |= (1 << i);
/* /*
...@@ -1728,14 +1757,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) ...@@ -1728,14 +1757,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0; int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts; struct cgroup_sb_opts opts;
unsigned long added_mask, removed_mask; u16 added_mask, removed_mask;
if (root == &cgrp_dfl_root) { if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n"); pr_err("remount is not allowed\n");
return -EINVAL; return -EINVAL;
} }
mutex_lock(&cgroup_mutex); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */ /* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts); ret = parse_cgroupfs_options(data, &opts);
...@@ -1768,7 +1797,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) ...@@ -1768,7 +1797,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
if (ret) if (ret)
goto out_unlock; goto out_unlock;
rebind_subsystems(&cgrp_dfl_root, removed_mask); WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
if (opts.release_agent) { if (opts.release_agent) {
spin_lock(&release_agent_path_lock); spin_lock(&release_agent_path_lock);
...@@ -1876,7 +1905,7 @@ static void init_cgroup_root(struct cgroup_root *root, ...@@ -1876,7 +1905,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
} }
static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{ {
LIST_HEAD(tmp_links); LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp; struct cgroup *root_cgrp = &root->cgrp;
...@@ -1899,10 +1928,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) ...@@ -1899,10 +1928,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
/* /*
* We're accessing css_set_count without locking css_set_lock here, * We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding * but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. The worst that can happen is that we * cgroup_lock, and that's us. Later rebinding may disable
* have some link structures left over * controllers on the default hierarchy and thus create new csets,
* which can't be more than the existing ones. Allocate 2x.
*/ */
ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret) if (ret)
goto cancel_ref; goto cancel_ref;
...@@ -1919,7 +1949,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) ...@@ -1919,7 +1949,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
} }
root_cgrp->kn = root->kf_root->kn; root_cgrp->kn = root->kf_root->kn;
ret = css_populate_dir(&root_cgrp->self, NULL); ret = css_populate_dir(&root_cgrp->self);
if (ret) if (ret)
goto destroy_root; goto destroy_root;
...@@ -1992,13 +2022,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ...@@ -1992,13 +2022,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
cgrp_dfl_root_visible = true; cgrp_dfl_visible = true;
root = &cgrp_dfl_root; root = &cgrp_dfl_root;
cgroup_get(&root->cgrp); cgroup_get(&root->cgrp);
goto out_mount; goto out_mount;
} }
mutex_lock(&cgroup_mutex); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */ /* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts); ret = parse_cgroupfs_options(data, &opts);
...@@ -2338,38 +2368,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, ...@@ -2338,38 +2368,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
} }
/** /**
* cgroup_taskset_migrate - migrate a taskset to a cgroup * cgroup_taskset_migrate - migrate a taskset
* @tset: taget taskset * @tset: taget taskset
* @dst_cgrp: destination cgroup * @root: cgroup root the migration is taking place on
* *
* Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the * Migrate tasks in @tset as setup by migration preparation functions.
* ->can_attach callbacks fails and guarantees that either all or none of * This function fails iff one of the ->can_attach callbacks fails and
* the tasks in @tset are migrated. @tset is consumed regardless of * guarantees that either all or none of the tasks in @tset are migrated.
* success. * @tset is consumed regardless of success.
*/ */
static int cgroup_taskset_migrate(struct cgroup_taskset *tset, static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
struct cgroup *dst_cgrp) struct cgroup_root *root)
{ {
struct cgroup_subsys_state *css, *failed_css = NULL; struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task; struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset; struct css_set *cset, *tmp_cset;
int i, ret; int ssid, failed_ssid, ret;
/* methods shouldn't be called if no task is actually migrating */ /* methods shouldn't be called if no task is actually migrating */
if (list_empty(&tset->src_csets)) if (list_empty(&tset->src_csets))
return 0; return 0;
/* check that we can legitimately attach to the cgroup */ /* check that we can legitimately attach to the cgroup */
for_each_e_css(css, i, dst_cgrp) { do_each_subsys_mask(ss, ssid, root->subsys_mask) {
if (css->ss->can_attach) { if (ss->can_attach) {
tset->ssid = i; tset->ssid = ssid;
ret = css->ss->can_attach(tset); ret = ss->can_attach(tset);
if (ret) { if (ret) {
failed_css = css; failed_ssid = ssid;
goto out_cancel_attach; goto out_cancel_attach;
} }
} }
} } while_each_subsys_mask();
/* /*
* Now that we're guaranteed success, proceed to move all tasks to * Now that we're guaranteed success, proceed to move all tasks to
...@@ -2396,25 +2426,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, ...@@ -2396,25 +2426,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/ */
tset->csets = &tset->dst_csets; tset->csets = &tset->dst_csets;
for_each_e_css(css, i, dst_cgrp) { do_each_subsys_mask(ss, ssid, root->subsys_mask) {
if (css->ss->attach) { if (ss->attach) {
tset->ssid = i; tset->ssid = ssid;
css->ss->attach(tset); ss->attach(tset);
}
} }
} while_each_subsys_mask();
ret = 0; ret = 0;
goto out_release_tset; goto out_release_tset;
out_cancel_attach: out_cancel_attach:
for_each_e_css(css, i, dst_cgrp) { do_each_subsys_mask(ss, ssid, root->subsys_mask) {
if (css == failed_css) if (ssid == failed_ssid)
break; break;
if (css->ss->cancel_attach) { if (ss->cancel_attach) {
tset->ssid = i; tset->ssid = ssid;
css->ss->cancel_attach(tset); ss->cancel_attach(tset);
}
} }
} while_each_subsys_mask();
out_release_tset: out_release_tset:
spin_lock_bh(&css_set_lock); spin_lock_bh(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets); list_splice_init(&tset->dst_csets, &tset->src_csets);
...@@ -2426,6 +2456,20 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, ...@@ -2426,6 +2456,20 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
return ret; return ret;
} }
/**
* cgroup_may_migrate_to - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
* On the default hierarchy, except for the root, subtree_control must be
* zero for migration destination cgroups with tasks so that child cgroups
* don't compete against tasks.
*/
static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
!dst_cgrp->subtree_control;
}
/** /**
* cgroup_migrate_finish - cleanup after attach * cgroup_migrate_finish - cleanup after attach
* @preloaded_csets: list of preloaded css_sets * @preloaded_csets: list of preloaded css_sets
...@@ -2442,6 +2486,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) ...@@ -2442,6 +2486,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
spin_lock_bh(&css_set_lock); spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL; cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL; cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node); list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset); put_css_set_locked(cset);
...@@ -2474,58 +2519,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, ...@@ -2474,58 +2519,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock); lockdep_assert_held(&css_set_lock);
/*
* If ->dead, @src_set is associated with one or more dead cgroups
* and doesn't contain any migratable tasks. Ignore it early so
* that the rest of migration path doesn't get confused by it.
*/
if (src_cset->dead)
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node)) if (!list_empty(&src_cset->mg_preload_node))
return; return;
WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_tasks));
WARN_ON(!list_empty(&src_cset->mg_node)); WARN_ON(!list_empty(&src_cset->mg_node));
src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset); get_css_set(src_cset);
list_add(&src_cset->mg_preload_node, preloaded_csets); list_add(&src_cset->mg_preload_node, preloaded_csets);
} }
/** /**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
* @dst_cgrp: the destination cgroup (may be %NULL)
* @preloaded_csets: list of preloaded source css_sets * @preloaded_csets: list of preloaded source css_sets
* *
* Tasks are about to be moved to @dst_cgrp and all the source css_sets * Tasks are about to be moved and all the source css_sets have been
* have been preloaded to @preloaded_csets. This function looks up and * preloaded to @preloaded_csets. This function looks up and pins all
* pins all destination css_sets, links each to its source, and append them * destination css_sets, links each to its source, and append them to
* to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each * @preloaded_csets.
* source css_set is assumed to be its cgroup on the default hierarchy.
* *
* This function must be called after cgroup_migrate_add_src() has been * This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed * called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on * using cgroup_migrate(), cgroup_migrate_finish() must be called on
* @preloaded_csets. * @preloaded_csets.
*/ */
static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
struct list_head *preloaded_csets)
{ {
LIST_HEAD(csets); LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset; struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
/*
* Except for the root, child_subsys_mask must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
dst_cgrp->child_subsys_mask)
return -EBUSY;
/* look up the dst cset for each src cset and link it to src */ /* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
struct css_set *dst_cset; struct css_set *dst_cset;
dst_cset = find_css_set(src_cset, dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
dst_cgrp ?: src_cset->dfl_cgrp);
if (!dst_cset) if (!dst_cset)
goto err; goto err;
...@@ -2538,6 +2581,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, ...@@ -2538,6 +2581,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
*/ */
if (src_cset == dst_cset) { if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL; src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node); list_del_init(&src_cset->mg_preload_node);
put_css_set(src_cset); put_css_set(src_cset);
put_css_set(dst_cset); put_css_set(dst_cset);
...@@ -2563,11 +2607,11 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, ...@@ -2563,11 +2607,11 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
* cgroup_migrate - migrate a process or task to a cgroup * cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate * @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task * @threadgroup: whether @leader points to the whole process or a single task
* @cgrp: the destination cgroup * @root: cgroup root migration is taking place on
* *
* Migrate a process or task denoted by @leader to @cgrp. If migrating a * Migrate a process or task denoted by @leader. If migrating a process,
* process, the caller must be holding cgroup_threadgroup_rwsem. The * the caller must be holding cgroup_threadgroup_rwsem. The caller is also
* caller is also responsible for invoking cgroup_migrate_add_src() and * responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this * cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish(). * function and following up with cgroup_migrate_finish().
* *
...@@ -2578,7 +2622,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, ...@@ -2578,7 +2622,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
* actually starting migrating. * actually starting migrating.
*/ */
static int cgroup_migrate(struct task_struct *leader, bool threadgroup, static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
struct cgroup *cgrp) struct cgroup_root *root)
{ {
struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct task_struct *task; struct task_struct *task;
...@@ -2599,7 +2643,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, ...@@ -2599,7 +2643,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
rcu_read_unlock(); rcu_read_unlock();
spin_unlock_bh(&css_set_lock); spin_unlock_bh(&css_set_lock);
return cgroup_taskset_migrate(&tset, cgrp); return cgroup_taskset_migrate(&tset, root);
} }
/** /**
...@@ -2617,6 +2661,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ...@@ -2617,6 +2661,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task; struct task_struct *task;
int ret; int ret;
if (!cgroup_may_migrate_to(dst_cgrp))
return -EBUSY;
/* look up all src csets */ /* look up all src csets */
spin_lock_bh(&css_set_lock); spin_lock_bh(&css_set_lock);
rcu_read_lock(); rcu_read_lock();
...@@ -2631,9 +2678,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ...@@ -2631,9 +2678,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
spin_unlock_bh(&css_set_lock); spin_unlock_bh(&css_set_lock);
/* prepare dst csets and commit */ /* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (!ret) if (!ret)
ret = cgroup_migrate(leader, threadgroup, dst_cgrp); ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
cgroup_migrate_finish(&preloaded_csets); cgroup_migrate_finish(&preloaded_csets);
return ret; return ret;
...@@ -2696,7 +2743,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ...@@ -2696,7 +2743,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL; return -EINVAL;
cgrp = cgroup_kn_lock_live(of->kn); cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp) if (!cgrp)
return -ENODEV; return -ENODEV;
...@@ -2794,7 +2841,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, ...@@ -2794,7 +2841,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
cgrp = cgroup_kn_lock_live(of->kn); cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp) if (!cgrp)
return -ENODEV; return -ENODEV;
spin_lock(&release_agent_path_lock); spin_lock(&release_agent_path_lock);
...@@ -2822,38 +2869,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) ...@@ -2822,38 +2869,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0; return 0;
} }
static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
bool printed = false; bool printed = false;
int ssid; int ssid;
for_each_subsys_which(ss, ssid, &ss_mask) { do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed) if (printed)
seq_putc(seq, ' '); seq_putc(seq, ' ');
seq_printf(seq, "%s", ss->name); seq_printf(seq, "%s", ss->name);
printed = true; printed = true;
} } while_each_subsys_mask();
if (printed) if (printed)
seq_putc(seq, '\n'); seq_putc(seq, '\n');
} }
-/* show controllers which are currently attached to the default hierarchy */
-static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
-			     ~cgrp_dfl_root_inhibit_ss_mask);
-	return 0;
-}
-
 /* show controllers which are enabled from the parent */
 static int cgroup_controllers_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
 
-	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
 	return 0;
 }
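[Editor's note: cgroup_root_controllers_show() can be dropped because cgroup_control() folds the root special case into one helper: for a non-root cgroup it returns the parent's subtree_control, and for the root it returns the hierarchy's subsys_mask with inhibited controllers masked off on the default hierarchy. A rough sketch (reconstructed; the exact handling of the implicit mask may differ):]

static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent)
		return parent->subtree_control;

	if (cgroup_on_dfl(cgrp))
		return cgrp->root->subsys_mask & ~cgrp_dfl_inhibit_ss_mask;

	return cgrp->root->subsys_mask;
}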
@@ -2870,16 +2907,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
  * @cgrp: root of the subtree to update csses for
  *
- * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
- * css associations need to be updated accordingly.  This function looks up
- * all css_sets which are attached to the subtree, creates the matching
- * updated css_sets and migrates the tasks to the new ones.
+ * @cgrp's control masks have changed and its subtree's css associations
+ * need to be updated accordingly.  This function looks up all css_sets
+ * which are attached to the subtree, creates the matching updated css_sets
+ * and migrates the tasks to the new ones.
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
 	LIST_HEAD(preloaded_csets);
 	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *d_css;
+	struct cgroup *dsct;
 	struct css_set *src_cset;
 	int ret;
@@ -2889,21 +2927,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	/* look up all csses currently attached to @cgrp's subtree */
 	spin_lock_bh(&css_set_lock);
-	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
 		struct cgrp_cset_link *link;
 
-		/* self is not affected by child_subsys_mask change */
-		if (css->cgroup == cgrp)
-			continue;
-
-		list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
-			cgroup_migrate_add_src(link->cset, cgrp,
+		list_for_each_entry(link, &dsct->cset_links, cset_link)
+			cgroup_migrate_add_src(link->cset, dsct,
 					       &preloaded_csets);
 	}
 	spin_unlock_bh(&css_set_lock);
 
-	/* NULL dst indicates self on default hierarchy */
-	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
 		goto out_finish;
@@ -2915,17 +2949,270 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		if (!src_cset->mg_src_cgrp)
 			break;
 
 		/* all tasks in src_csets need to be migrated */
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
 			cgroup_taskset_add(task, &tset);
 	}
 	spin_unlock_bh(&css_set_lock);
 
-	ret = cgroup_taskset_migrate(&tset, cgrp);
+	ret = cgroup_taskset_migrate(&tset, cgrp->root);
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
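[Editor's note: once the taskset is committed, each bound controller sees the moved tasks through its migration methods. A hypothetical controller's ->attach() consuming such a taskset might look like this; foo_move_task() and css_foo() are made-up names for illustration, cgroup_taskset_for_each() is the real iterator:]

static void foo_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct task_struct *task;

	/* iterate every migrating task together with its destination css */
	cgroup_taskset_for_each(task, css, tset)
		foo_move_task(task, css_foo(css));	/* hypothetical */
}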
+/**
+ * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ * @cgrp: root of the target subtree
+ *
+ * Because css offlining is asynchronous, userland may try to re-enable a
+ * controller while the previous css is still around.  This function grabs
+ * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ */
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+	__acquires(&cgroup_mutex)
+{
+	struct cgroup *dsct;
+	struct cgroup_subsys_state *d_css;
+	struct cgroup_subsys *ss;
+	int ssid;
+
+restart:
+	mutex_lock(&cgroup_mutex);
+
+	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+		for_each_subsys(ss, ssid) {
+			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+			DEFINE_WAIT(wait);
+
+			if (!css || !percpu_ref_is_dying(&css->refcnt))
+				continue;
+
+			cgroup_get(dsct);
+			prepare_to_wait(&dsct->offline_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			mutex_unlock(&cgroup_mutex);
+			schedule();
+			finish_wait(&dsct->offline_waitq, &wait);
+			cgroup_put(dsct);
+
+			goto restart;
+		}
+	}
+}
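[Editor's note: the drain loop is the classic waitqueue idiom: the waiter is registered with prepare_to_wait() before the condition is rechecked and the lock dropped, so the wake-up issued when a css finishes going offline cannot slip between the check and schedule(). Reduced to its skeleton (generic sketch; still_dying() is a hypothetical predicate standing in for the percpu_ref check above):]

DEFINE_WAIT(wait);

while (still_dying(css)) {		/* hypothetical predicate */
	prepare_to_wait(&cgrp->offline_waitq, &wait, TASK_UNINTERRUPTIBLE);
	if (still_dying(css))
		schedule();		/* woken when offlining completes */
	finish_wait(&cgrp->offline_waitq, &wait);
}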
+/**
+ * cgroup_save_control - save control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+	struct cgroup *dsct;
+	struct cgroup_subsys_state *d_css;
+
+	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+		dsct->old_subtree_control = dsct->subtree_control;
+		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+	}
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+	struct cgroup *dsct;
+	struct cgroup_subsys_state *d_css;
+
+	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+		dsct->subtree_control &= cgroup_control(dsct);
+		dsct->subtree_ss_mask =
+			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+						    cgroup_ss_mask(dsct));
+	}
+}
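[Editor's note: per the commit list, cgroup_calc_subtree_ss_mask() now takes @this_ss_mask. It computes the closure of ->depends_on over the requested controllers, clamped to what is actually available, so implicitly-enabled dependencies are pulled in automatically. Roughly (reconstructed sketch, approximate):]

static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* iterate until the dependency closure stops growing */
	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		/* mask out subsystems which aren't available */
		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}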
+/**
+ * cgroup_restore_control - restore control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+	struct cgroup *dsct;
+	struct cgroup_subsys_state *d_css;
+
+	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+		dsct->subtree_control = dsct->old_subtree_control;
+		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+	}
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{
+	struct cgroup_subsys *ss = css->ss;
+	struct cgroup *cgrp = css->cgroup;
+
+	if (cgroup_control(cgrp) & (1 << ss->id))
+		return true;
+	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+		return false;
+	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+}
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible.  A css is created invisible if it's being implicitly enabled
+ * through dependency.  An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure.  On failure, csses which have
+ * been processed already aren't cleaned up.  The caller is responsible for
+ * cleaning up with cgroup_apply_control_disable().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+	struct cgroup *dsct;
+	struct cgroup_subsys_state *d_css;
+	struct cgroup_subsys *ss;
+	int ssid, ret;
+
+	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+		for_each_subsys(ss, ssid) {
+			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+				continue;
+
+			if (!css) {
+				css = css_create(dsct, ss);
+				if (IS_ERR(css))
+					return PTR_ERR(css);
+			}
+
+			if (css_visible(css)) {
+				ret = css_populate_dir(css);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill and hide csses so that they match
+ * cgroup_ss_mask() and cgroup_visible_mask().
+ *
+ * A css is hidden when the userland requests it to be disabled while other
+ * subsystems are still depending on it.  The css must not actively control
+ * resources and be in the vanilla state if it's made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+	struct cgroup *dsct;
+	struct cgroup_subsys_state *d_css;
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+		for_each_subsys(ss, ssid) {
+			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+			if (!css)
+				continue;
+
+			if (css->parent &&
+			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+				kill_css(css);
+			} else if (!css_visible(css)) {
+				css_clear_dir(css);
+				if (ss->css_reset)
+					ss->css_reset(css);
+			}
+		}
+	}
+}
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * Subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and performs
+ * process migrations.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+	int ret;
+
+	cgroup_propagate_control(cgrp);
+
+	ret = cgroup_apply_control_enable(cgrp);
+	if (ret)
+		return ret;
+
+	/*
+	 * At this point, cgroup_e_css() results reflect the new csses
+	 * making the following cgroup_update_dfl_csses() properly update
+	 * css associations of all tasks in the subtree.
+	 */
+	ret = cgroup_update_dfl_csses(cgrp);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update.  See cgroup_apply_control() for more info.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+	if (ret) {
+		cgroup_restore_control(cgrp);
+		cgroup_propagate_control(cgrp);
+	}
+
+	cgroup_apply_control_disable(cgrp);
+}
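[Editor's note: taken together, the helpers give every mask-changing path the same recursive, idempotent shape; the subtree_control write handler below follows exactly this sequence. As a compact sketch of the calling convention described in the cgroup_apply_control() comment:]

	cgroup_save_control(cgrp);		/* 1. stash current masks */

	cgrp->subtree_control |= enable;	/* 2. edit masks as desired */
	cgrp->subtree_control &= ~disable;

	ret = cgroup_apply_control(cgrp);	/* 3. converge the subtree */
						/* 4. other related work */
	cgroup_finalize_control(cgrp, ret);	/* 5. commit or roll back */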
 /* change the enabled child controllers for a cgroup in the default hierarchy */
@@ -2933,8 +3220,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 					    char *buf, size_t nbytes,
 					    loff_t off)
 {
-	unsigned long enable = 0, disable = 0;
-	unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+	u16 enable = 0, disable = 0;
 	struct cgroup *cgrp, *child;
 	struct cgroup_subsys *ss;
 	char *tok;
@@ -2946,11 +3232,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 */
 	buf = strstrip(buf);
 	while ((tok = strsep(&buf, " "))) {
-		unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
-
 		if (tok[0] == '\0')
 			continue;
-		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
 			if (!cgroup_ssid_enabled(ssid) ||
 			    strcmp(tok + 1, ss->name))
 				continue;
@@ -2965,12 +3249,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 				return -EINVAL;
 			}
 			break;
-		}
+		} while_each_subsys_mask();
 		if (ssid == CGROUP_SUBSYS_COUNT)
 			return -EINVAL;
 	}
 
-	cgrp = cgroup_kn_lock_live(of->kn);
+	cgrp = cgroup_kn_lock_live(of->kn, true);
 	if (!cgrp)
 		return -ENODEV;
@@ -2981,10 +3265,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 			continue;
 		}
 
-		/* unavailable or not enabled on the parent? */
-		if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
-		    (cgroup_parent(cgrp) &&
-		     !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+		if (!(cgroup_control(cgrp) & (1 << ssid))) {
 			ret = -ENOENT;
 			goto out_unlock;
 		}
@@ -3018,150 +3299,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 		goto out_unlock;
 	}
 
-	/*
-	 * Update subsys masks and calculate what needs to be done.  More
-	 * subsystems than specified may need to be enabled or disabled
-	 * depending on subsystem dependencies.
-	 */
-	old_sc = cgrp->subtree_control;
-	old_ss = cgrp->child_subsys_mask;
-	new_sc = (old_sc | enable) & ~disable;
-	new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
-
-	css_enable = ~old_ss & new_ss;
-	css_disable = old_ss & ~new_ss;
-	enable |= css_enable;
-	disable |= css_disable;
-
-	/*
-	 * Because css offlining is asynchronous, userland might try to
-	 * re-enable the same controller while the previous instance is
-	 * still around.  In such cases, wait till it's gone using
-	 * offline_waitq.
-	 */
-	for_each_subsys_which(ss, ssid, &css_enable) {
-		cgroup_for_each_live_child(child, cgrp) {
-			DEFINE_WAIT(wait);
-
-			if (!cgroup_css(child, ss))
-				continue;
-
-			cgroup_get(child);
-			prepare_to_wait(&child->offline_waitq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			cgroup_kn_unlock(of->kn);
-			schedule();
-			finish_wait(&child->offline_waitq, &wait);
-			cgroup_put(child);
-
-			return restart_syscall();
-		}
-	}
-
-	cgrp->subtree_control = new_sc;
-	cgrp->child_subsys_mask = new_ss;
-
-	/*
-	 * Create new csses or make the existing ones visible.  A css is
-	 * created invisible if it's being implicitly enabled through
-	 * dependency.  An invisible css is made visible when the userland
-	 * explicitly enables it.
-	 */
-	for_each_subsys(ss, ssid) {
-		if (!(enable & (1 << ssid)))
-			continue;
-
-		cgroup_for_each_live_child(child, cgrp) {
-			if (css_enable & (1 << ssid))
-				ret = create_css(child, ss,
-					cgrp->subtree_control & (1 << ssid));
-			else
-				ret = css_populate_dir(cgroup_css(child, ss),
-						       NULL);
-			if (ret)
-				goto err_undo_css;
-		}
-	}
-
-	/*
-	 * At this point, cgroup_e_css() results reflect the new csses
-	 * making the following cgroup_update_dfl_csses() properly update
-	 * css associations of all tasks in the subtree.
-	 */
-	ret = cgroup_update_dfl_csses(cgrp);
-	if (ret)
-		goto err_undo_css;
-
-	/*
-	 * All tasks are migrated out of disabled csses.  Kill or hide
-	 * them.  A css is hidden when the userland requests it to be
-	 * disabled while other subsystems are still depending on it.  The
-	 * css must not actively control resources and be in the vanilla
-	 * state if it's made visible again later.  Controllers which may
-	 * be depended upon should provide ->css_reset() for this purpose.
-	 */
-	for_each_subsys(ss, ssid) {
-		if (!(disable & (1 << ssid)))
-			continue;
-
-		cgroup_for_each_live_child(child, cgrp) {
-			struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
-			if (css_disable & (1 << ssid)) {
-				kill_css(css);
-			} else {
-				css_clear_dir(css, NULL);
-				if (ss->css_reset)
-					ss->css_reset(css);
-			}
-		}
-	}
-
-	/*
-	 * The effective csses of all the descendants (excluding @cgrp) may
-	 * have changed.  Subsystems can optionally subscribe to this event
-	 * by implementing ->css_e_css_changed() which is invoked if any of
-	 * the effective csses seen from the css's cgroup may have changed.
-	 */
-	for_each_subsys(ss, ssid) {
-		struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
-		struct cgroup_subsys_state *css;
-
-		if (!ss->css_e_css_changed || !this_css)
-			continue;
-
-		css_for_each_descendant_pre(css, this_css)
-			if (css != this_css)
-				ss->css_e_css_changed(css);
-	}
+	/* save and update control masks and prepare csses */
+	cgroup_save_control(cgrp);
+
+	cgrp->subtree_control |= enable;
+	cgrp->subtree_control &= ~disable;
+
+	ret = cgroup_apply_control(cgrp);
+
+	cgroup_finalize_control(cgrp, ret);
 
 	kernfs_activate(cgrp->kn);
 	ret = 0;
 out_unlock:
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
-
-err_undo_css:
-	cgrp->subtree_control = old_sc;
-	cgrp->child_subsys_mask = old_ss;
-
-	for_each_subsys(ss, ssid) {
-		if (!(enable & (1 << ssid)))
-			continue;
-
-		cgroup_for_each_live_child(child, cgrp) {
-			struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
-			if (!css)
-				continue;
-
-			if (css_enable & (1 << ssid))
-				kill_css(css);
-			else
-				css_clear_dir(css, NULL);
-		}
-	}
-
-	goto out_unlock;
 }
 static int cgroup_events_show(struct seq_file *seq, void *v)
@@ -3359,7 +3511,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      bool is_add)
 {
 	struct cftype *cft, *cft_end = NULL;
-	int ret;
+	int ret = 0;
 
 	lockdep_assert_held(&cgroup_mutex);
@@ -3388,7 +3540,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			cgroup_rm_file(cgrp, cft);
 		}
 	}
-	return 0;
+	return ret;
 }
 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
@@ -3405,7 +3557,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
 		struct cgroup *cgrp = css->cgroup;
 
-		if (cgroup_is_dead(cgrp))
+		if (!(css->flags & CSS_VISIBLE))
 			continue;
 
 		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
@@ -4026,6 +4178,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	struct task_struct *task;
 	int ret;
 
+	if (!cgroup_may_migrate_to(to))
+		return -EBUSY;
+
 	mutex_lock(&cgroup_mutex);
 
 	/* all tasks in @from are being moved, all csets are source */
@@ -4034,7 +4189,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
 	spin_unlock_bh(&css_set_lock);
 
-	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
 		goto out_err;
@@ -4050,7 +4205,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 		css_task_iter_end(&it);
 
 		if (task) {
-			ret = cgroup_migrate(task, false, to);
+			ret = cgroup_migrate(task, false, to->root);
 			put_task_struct(task);
 		}
 	} while (task && !ret);
@@ -4557,12 +4712,6 @@ static struct cftype cgroup_dfl_base_files[] = {
 	},
 	{
 		.name = "cgroup.controllers",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_root_controllers_show,
-	},
-	{
-		.name = "cgroup.controllers",
-		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_controllers_show,
 	},
 	{
@@ -4731,7 +4880,9 @@ static void css_release_work_fn(struct work_struct *work)
 		 * Those are supported by RCU protecting clearing of
 		 * cgrp->kn->priv backpointer.
 		 */
-		RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+		if (cgrp->kn)
+			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
+					 NULL);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -4802,6 +4953,9 @@ static void offline_css(struct cgroup_subsys_state *css)
 	if (!(css->flags & CSS_ONLINE))
 		return;
 
+	if (ss->css_reset)
+		ss->css_reset(css);
+
 	if (ss->css_offline)
 		ss->css_offline(css);
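[Editor's note: ->css_reset() is the controller's hook for returning a css to its vanilla state, both when a hidden css may be re-shown later and, with the hunk above, right before offlining. A hypothetical controller's implementation; foo_cgroup, css_foo() and the default value are made-up names for illustration:]

static void foo_css_reset(struct cgroup_subsys_state *css)
{
	struct foo_cgroup *foo = css_foo(css);	/* hypothetical container_of wrapper */

	/* forget all user configuration and stop exerting control */
	foo->limit = FOO_LIMIT_DEFAULT;
}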
@@ -4812,17 +4966,16 @@ static void offline_css(struct cgroup_subsys_state *css)
 }
 
 /**
- * create_css - create a cgroup_subsys_state
+ * css_create - create a cgroup_subsys_state
  * @cgrp: the cgroup new css will be associated with
  * @ss: the subsys of new css
- * @visible: whether to create control knobs for the new css or not
  *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
- * css is online and installed in @cgrp with all interface files created if
- * @visible.  Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp.  This function doesn't create the
+ * interface files.  Returns 0 on success, -errno on failure.
  */
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
-		      bool visible)
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
 	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4833,7 +4986,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 	css = ss->css_alloc(parent_css);
 	if (IS_ERR(css))
-		return PTR_ERR(css);
+		return css;
 
 	init_and_link_css(css, ss, cgrp);
@@ -4846,12 +4999,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		goto err_free_percpu_ref;
 	css->id = err;
 
-	if (visible) {
-		err = css_populate_dir(css, NULL);
-		if (err)
-			goto err_free_id;
-	}
-
 	/* @css is ready to be brought online now, make it visible */
 	list_add_tail_rcu(&css->sibling, &parent_css->children);
 	cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -4869,47 +5016,30 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		ss->warned_broken_hierarchy = true;
 	}
 
-	return 0;
+	return css;
 
 err_list_del:
 	list_del_rcu(&css->sibling);
-	css_clear_dir(css, NULL);
-err_free_id:
 	cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
 	percpu_ref_exit(&css->refcnt);
 err_free_css:
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
-	return err;
+	return ERR_PTR(err);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-			umode_t mode)
+static struct cgroup *cgroup_create(struct cgroup *parent)
 {
-	struct cgroup *parent, *cgrp, *tcgrp;
-	struct cgroup_root *root;
-	struct cgroup_subsys *ss;
-	struct kernfs_node *kn;
-	int level, ssid, ret;
-
-	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
-	 */
-	if (strchr(name, '\n'))
-		return -EINVAL;
-
-	parent = cgroup_kn_lock_live(parent_kn);
-	if (!parent)
-		return -ENODEV;
-	root = parent->root;
-	level = parent->level + 1;
+	struct cgroup_root *root = parent->root;
+	struct cgroup *cgrp, *tcgrp;
+	int level = parent->level + 1;
+	int ret;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp) +
 		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
-	if (!cgrp) {
-		ret = -ENOMEM;
-		goto out_unlock;
-	}
+	if (!cgrp)
+		return ERR_PTR(-ENOMEM);
 
 	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
 	if (ret)
@@ -4940,20 +5070,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-	/* create the directory */
-	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
-	if (IS_ERR(kn)) {
-		ret = PTR_ERR(kn);
-		goto out_free_id;
-	}
-	cgrp->kn = kn;
-
-	/*
-	 * This extra ref will be put in cgroup_free_fn() and guarantees
-	 * that @cgrp->kn is always accessible.
-	 */
-	kernfs_get(kn);
-
 	cgrp->self.serial_nr = css_serial_nr_next++;
 
 	/* allocation complete, commit to creation */
@@ -4967,51 +5083,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	 */
 	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
-	ret = cgroup_kn_set_ugid(kn);
-	if (ret)
-		goto out_destroy;
+	/*
+	 * On the default hierarchy, a child doesn't automatically inherit
+	 * subtree_control from the parent.  Each is configured manually.
+	 */
+	if (!cgroup_on_dfl(cgrp))
+		cgrp->subtree_control = cgroup_control(cgrp);
 
-	ret = css_populate_dir(&cgrp->self, NULL);
-	if (ret)
-		goto out_destroy;
+	cgroup_propagate_control(cgrp);
 
-	/* let's create and online css's */
-	for_each_subsys(ss, ssid) {
-		if (parent->child_subsys_mask & (1 << ssid)) {
-			ret = create_css(cgrp, ss,
-					 parent->subtree_control & (1 << ssid));
-			if (ret)
-				goto out_destroy;
-		}
-	}
+	/* @cgrp doesn't have dir yet so the following will only create csses */
+	ret = cgroup_apply_control_enable(cgrp);
+	if (ret)
+		goto out_destroy;
 
-	/*
-	 * On the default hierarchy, a child doesn't automatically inherit
-	 * subtree_control from the parent.  Each is configured manually.
-	 */
-	if (!cgroup_on_dfl(cgrp)) {
-		cgrp->subtree_control = parent->subtree_control;
-		cgroup_refresh_child_subsys_mask(cgrp);
-	}
+	return cgrp;
+
+out_cancel_ref:
+	percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+	kfree(cgrp);
+	return ERR_PTR(ret);
+out_destroy:
+	cgroup_destroy_locked(cgrp);
+	return ERR_PTR(ret);
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			umode_t mode)
+{
+	struct cgroup *parent, *cgrp;
+	struct kernfs_node *kn;
+	int ret;
+
+	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
+	parent = cgroup_kn_lock_live(parent_kn, false);
+	if (!parent)
+		return -ENODEV;
+
+	cgrp = cgroup_create(parent);
+	if (IS_ERR(cgrp)) {
+		ret = PTR_ERR(cgrp);
+		goto out_unlock;
+	}
+
+	/* create the directory */
+	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+	if (IS_ERR(kn)) {
+		ret = PTR_ERR(kn);
+		goto out_destroy;
+	}
+	cgrp->kn = kn;
+
+	/*
+	 * This extra ref will be put in cgroup_free_fn() and guarantees
+	 * that @cgrp->kn is always accessible.
+	 */
+	kernfs_get(kn);
+
+	ret = cgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	ret = css_populate_dir(&cgrp->self);
+	if (ret)
+		goto out_destroy;
+
+	ret = cgroup_apply_control_enable(cgrp);
+	if (ret)
+		goto out_destroy;
+
+	/* let's create and online css's */
 	kernfs_activate(kn);
 
 	ret = 0;
 	goto out_unlock;
 
-out_free_id:
-	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
-out_cancel_ref:
-	percpu_ref_exit(&cgrp->self.refcnt);
-out_free_cgrp:
-	kfree(cgrp);
+out_destroy:
+	cgroup_destroy_locked(cgrp);
 out_unlock:
 	cgroup_kn_unlock(parent_kn);
 	return ret;
-
-out_destroy:
-	cgroup_destroy_locked(cgrp);
-	goto out_unlock;
 }
 
 /*
@@ -5065,7 +5220,7 @@ static void kill_css(struct cgroup_subsys_state *css)
 	 * This must happen before css is disassociated with its cgroup.
 	 * See seq_css() for details.
 	 */
-	css_clear_dir(css, NULL);
+	css_clear_dir(css);
 
 	/*
 	 * Killing would put the base ref, but we need to keep it alive
@@ -5114,6 +5269,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup_subsys_state *css;
+	struct cgrp_cset_link *link;
 	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
@@ -5134,11 +5290,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		return -EBUSY;
 
 	/*
-	 * Mark @cgrp dead.  This prevents further task migration and child
-	 * creation by disabling cgroup_lock_live_group().
+	 * Mark @cgrp and the associated csets dead.  The former prevents
+	 * further task migration and child creation by disabling
+	 * cgroup_lock_live_group().  The latter makes the csets ignored by
+	 * the migration path.
 	 */
 	cgrp->self.flags &= ~CSS_ONLINE;
 
+	spin_lock_bh(&css_set_lock);
+	list_for_each_entry(link, &cgrp->cset_links, cset_link)
+		link->cset->dead = true;
+	spin_unlock_bh(&css_set_lock);
+
 	/* initiate massacre of all css's */
 	for_each_css(css, ssid, cgrp)
 		kill_css(css);
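[Editor's note: this is the v4.4-era zombie-migration regression fix called out in the merge message: csets hanging off a dead cgroup must never be picked up as migration sources. The consumer side is an early bail in cgroup_migrate_add_src(); roughly (reconstructed sketch, approximate):]

	/* in cgroup_migrate_add_src() (sketch): */
	if (src_cset->dead)
		return;	/* belongs to a dead cgroup; nothing migratable */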
@@ -5162,7 +5325,7 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 	struct cgroup *cgrp;
 	int ret = 0;
 
-	cgrp = cgroup_kn_lock_live(kn);
+	cgrp = cgroup_kn_lock_live(kn, false);
 	if (!cgrp)
 		return 0;
@@ -5252,7 +5415,7 @@ int __init cgroup_init_early(void)
 	for_each_subsys(ss, i) {
 		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
-		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
 		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
 		     ss->id, ss->name);
 		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
@@ -5269,7 +5432,7 @@ int __init cgroup_init_early(void)
 	return 0;
 }
 
-static unsigned long cgroup_disable_mask __initdata;
+static u16 cgroup_disable_mask __initdata;
 
 /**
  * cgroup_init - cgroup initialization
@@ -5280,18 +5443,21 @@ static unsigned long cgroup_disable_mask __initdata;
 int __init cgroup_init(void)
 {
 	struct cgroup_subsys *ss;
-	unsigned long key;
 	int ssid;
 
+	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
 	mutex_lock(&cgroup_mutex);
 
-	/* Add init_css_set to the hash table */
-	key = css_set_hash(init_css_set.subsys);
-	hash_add(css_set_table, &init_css_set.hlist, key);
+	/*
+	 * Add init_css_set to the hash table so that dfl_root can link to
+	 * it during init.
+	 */
+	hash_add(css_set_table, &init_css_set.hlist,
+		 css_set_hash(init_css_set.subsys));
 
 	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
@@ -5324,10 +5490,16 @@ int __init cgroup_init(void)
 			continue;
 		}
 
+		if (cgroup_ssid_no_v1(ssid))
+			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
+			       ss->name);
+
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-		if (!ss->dfl_cftypes)
-			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+		if (ss->implicit_on_dfl)
+			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+		else if (!ss->dfl_cftypes)
+			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
 
 		if (ss->dfl_cftypes == ss->legacy_cftypes) {
 			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
@@ -5340,6 +5512,11 @@ int __init cgroup_init(void)
 			ss->bind(init_css_set.subsys[ssid]);
 	}
 
+	/* init_css_set.subsys[] has been updated, re-hash */
+	hash_del(&init_css_set.hlist);
+	hash_add(css_set_table, &init_css_set.hlist,
+		 css_set_hash(init_css_set.subsys));
+
 	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
 	WARN_ON(register_filesystem(&cgroup_fs_type));
 	WARN_ON(register_filesystem(&cgroup2_fs_type));
@@ -5398,7 +5575,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		struct cgroup *cgrp;
 		int ssid, count = 0;
 
-		if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
 			continue;
 
 		seq_printf(m, "%d:", root->hierarchy_id);
@@ -5513,11 +5690,11 @@ int cgroup_can_fork(struct task_struct *child)
 	struct cgroup_subsys *ss;
 	int i, j, ret;
 
-	for_each_subsys_which(ss, i, &have_canfork_callback) {
+	do_each_subsys_mask(ss, i, have_canfork_callback) {
 		ret = ss->can_fork(child);
 		if (ret)
 			goto out_revert;
-	}
+	} while_each_subsys_mask();
 
 	return 0;
@@ -5602,8 +5779,9 @@ void cgroup_post_fork(struct task_struct *child)
 	 * css_set; otherwise, @child might change state between ->fork()
 	 * and addition to css_set.
 	 */
-	for_each_subsys_which(ss, i, &have_fork_callback)
+	do_each_subsys_mask(ss, i, have_fork_callback) {
 		ss->fork(child);
+	} while_each_subsys_mask();
 }
 /**
@@ -5646,8 +5824,9 @@ void cgroup_exit(struct task_struct *tsk)
 	}
 
 	/* see cgroup_post_fork() for details */
-	for_each_subsys_which(ss, i, &have_exit_callback)
+	do_each_subsys_mask(ss, i, have_exit_callback) {
 		ss->exit(tsk);
+	} while_each_subsys_mask();
 }
 void cgroup_free(struct task_struct *task)
@@ -5656,8 +5835,9 @@ void cgroup_free(struct task_struct *task)
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	for_each_subsys_which(ss, ssid, &have_free_callback)
+	do_each_subsys_mask(ss, ssid, have_free_callback) {
 		ss->free(task);
+	} while_each_subsys_mask();
 
 	put_css_set(cset);
 }
@@ -5750,6 +5930,33 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
+static int __init cgroup_no_v1(char *str)
+{
+	struct cgroup_subsys *ss;
+	char *token;
+	int i;
+
+	while ((token = strsep(&str, ",")) != NULL) {
+		if (!*token)
+			continue;
+
+		if (!strcmp(token, "all")) {
+			cgroup_no_v1_mask = U16_MAX;
+			break;
+		}
+
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			cgroup_no_v1_mask |= 1 << i;
+		}
+	}
+	return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -5763,12 +5970,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 					     struct cgroup_subsys *ss)
 {
 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct file_system_type *s_type = dentry->d_sb->s_type;
 	struct cgroup_subsys_state *css = NULL;
 	struct cgroup *cgrp;
 
 	/* is @dentry a cgroup dir? */
-	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
-	    kernfs_type(kn) != KERNFS_DIR)
+	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
+	    !kn || kernfs_type(kn) != KERNFS_DIR)
 		return ERR_PTR(-EBADF);
 
 	rcu_read_lock();
@@ -2089,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.attach		= cpuset_attach,
 	.bind		= cpuset_bind,
 	.legacy_cftypes	= files,
-	.early_init	= 1,
+	.early_init	= true,
 };
 /**
@@ -8441,7 +8441,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.legacy_cftypes	= cpu_files,
-	.early_init	= 1,
+	.early_init	= true,
 };
 
 #endif	/* CONFIG_CGROUP_SCHED */
@@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.css_alloc	= cpuacct_css_alloc,
 	.css_free	= cpuacct_css_free,
 	.legacy_cftypes	= files,
-	.early_init	= 1,
+	.early_init	= true,
 };