Commit 6b5f04b6 authored by Linus Torvalds

Merge branch 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "cgroup changes for v4.6-rc1.  No userland visible behavior changes in
  this pull request.  I'll send out a separate pull request for the
  addition of cgroup namespace support.

   - The biggest change is the revamping of cgroup core task migration
     and controller handling logic.  There are quite a few places where
     controllers and tasks are manipulated.  Previously, many of those
     places implemented custom operations for each specific use case
     assuming specific starting conditions.  While this worked, it makes
     the code fragile and difficult to follow.

     The bulk of this pull request restructures these operations so that
     most related operations are performed through common helpers which
     are recursive (subtrees are always processed consistently)
     and idempotent (they make cgroup hierarchy converge to the target
     state rather than performing operations assuming specific starting
     conditions).  This makes the code a lot easier to understand,
     verify and extend.

   - Implicit controller support is added.  This is primarily for using
     perf_event on the v2 hierarchy so that perf can match cgroup v2
     path without requiring the user to do anything special.  The kernel
     portion of perf_event changes is acked but userland changes are
     still pending review.

   - cgroup_no_v1= boot parameter added to ease testing cgroup v2 in
     certain environments.

   - There is a regression introduced during v4.4 devel cycle where
     attempts to migrate zombie tasks can mess up internal object
     management.  This was fixed earlier this week and included in this
     pull request w/ stable cc'd.

   - Misc non-critical fixes and improvements"

* 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (44 commits)
  cgroup: avoid false positive gcc-6 warning
  cgroup: ignore css_sets associated with dead cgroups during migration
  Documentation: cgroup v2: Trivial heading correction.
  cgroup: implement cgroup_subsys->implicit_on_dfl
  cgroup: use css_set->mg_dst_cgrp for the migration target cgroup
  cgroup: make cgroup[_taskset]_migrate() take cgroup_root instead of cgroup
  cgroup: move migration destination verification out of cgroup_migrate_prepare_dst()
  cgroup: fix incorrect destination cgroup in cgroup_update_dfl_csses()
  cgroup: Trivial correction to reflect controller.
  cgroup: remove stale item in cgroup-v1 document INDEX file.
  cgroup: update css iteration in cgroup_update_dfl_csses()
  cgroup: allocate 2x cgrp_cset_links when setting up a new root
  cgroup: make cgroup_calc_subtree_ss_mask() take @this_ss_mask
  cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends
  cgroup: use cgroup_apply_enable_control() in cgroup creation path
  cgroup: combine cgroup_mutex locking and offline css draining
  cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write()
  cgroup: introduce cgroup_{save|propagate|restore}_control()
  cgroup: make cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() recursive
  cgroup: factor out cgroup_apply_control_enable() from cgroup_subtree_control_write()
  ...
parents fcab86ad cfe02a8a
...@@ -24,5 +24,3 @@ net_prio.txt ...@@ -24,5 +24,3 @@ net_prio.txt
- Network priority cgroups details and usages. - Network priority cgroups details and usages.
pids.txt pids.txt
- Process number cgroups details and usages. - Process number cgroups details and usages.
unified-hierarchy.txt
- Description the new/next cgroup interface.
...@@ -132,6 +132,12 @@ strongly discouraged for production use. It is recommended to decide ...@@ -132,6 +132,12 @@ strongly discouraged for production use. It is recommended to decide
the hierarchies and controller associations before starting using the the hierarchies and controller associations before starting using the
controllers after system boot. controllers after system boot.
During transition to v2, system management software might still
automount the v1 cgroup filesystem and so hijack all controllers
during boot, before manual intervention is possible. To make testing
and experimenting easier, the kernel parameter cgroup_no_v1= allows
disabling controllers in v1 and making them always available in v2.
2-2. Organizing Processes 2-2. Organizing Processes
...@@ -915,7 +921,7 @@ PAGE_SIZE multiple when read back. ...@@ -915,7 +921,7 @@ PAGE_SIZE multiple when read back.
limit, anonymous memory of the cgroup will not be swapped out. limit, anonymous memory of the cgroup will not be swapped out.
5-2-2. General Usage 5-2-2. Usage Guidelines
"memory.high" is the main mechanism to control memory usage. "memory.high" is the main mechanism to control memory usage.
Over-committing on high limit (sum of high limits > available memory) Over-committing on high limit (sum of high limits > available memory)
......
...@@ -614,6 +614,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ...@@ -614,6 +614,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
cut the overhead, others just disable the usage. So cut the overhead, others just disable the usage. So
only cgroup_disable=memory is actually worthy} only cgroup_disable=memory is actually worthy}
cgroup_no_v1= [KNL] Disable one, multiple, or all cgroup controllers in v1
Format: { controller[,controller...] | "all" }
Like cgroup_disable, but only applies to cgroup v1;
the blacklisted controllers remain available in cgroup2.
cgroup.memory= [KNL] Pass options to the cgroup memory controller. cgroup.memory= [KNL] Pass options to the cgroup memory controller.
Format: <string> Format: <string>
nosocket -- Disable socket memory accounting. nosocket -- Disable socket memory accounting.
......
...@@ -45,6 +45,7 @@ enum { ...@@ -45,6 +45,7 @@ enum {
CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_NO_REF = (1 << 0), /* no reference counting for this css */
CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
CSS_VISIBLE = (1 << 3), /* css is visible to userland */
}; };
/* bits in struct cgroup flags field */ /* bits in struct cgroup flags field */
...@@ -190,12 +191,13 @@ struct css_set { ...@@ -190,12 +191,13 @@ struct css_set {
/* /*
* If this cset is acting as the source of migration the following * If this cset is acting as the source of migration the following
* two fields are set. mg_src_cgrp is the source cgroup of the * two fields are set. mg_src_cgrp and mg_dst_cgrp are
* on-going migration and mg_dst_cset is the destination cset the * respectively the source and destination cgroups of the on-going
* target tasks on this cset should be migrated to. Protected by * migration. mg_dst_cset is the destination cset the target tasks
* cgroup_mutex. * on this cset should be migrated to. Protected by cgroup_mutex.
*/ */
struct cgroup *mg_src_cgrp; struct cgroup *mg_src_cgrp;
struct cgroup *mg_dst_cgrp;
struct css_set *mg_dst_cset; struct css_set *mg_dst_cset;
/* /*
...@@ -210,6 +212,9 @@ struct css_set { ...@@ -210,6 +212,9 @@ struct css_set {
/* all css_task_iters currently walking this cset */ /* all css_task_iters currently walking this cset */
struct list_head task_iters; struct list_head task_iters;
/* dead and being drained, ignore for migration */
bool dead;
/* For RCU-protected deletion */ /* For RCU-protected deletion */
struct rcu_head rcu_head; struct rcu_head rcu_head;
}; };
...@@ -253,13 +258,14 @@ struct cgroup { ...@@ -253,13 +258,14 @@ struct cgroup {
/* /*
* The bitmask of subsystems enabled on the child cgroups. * The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through * ->subtree_control is the one configured through
* "cgroup.subtree_control" while ->child_subsys_mask is the * "cgroup.subtree_control" while ->subtree_ss_mask is the effective
* effective one which may have more subsystems enabled. * one which may have more subsystems enabled. Controller knobs
* Controller knobs are made available iff it's enabled in * are made available iff it's enabled in ->subtree_control.
* ->subtree_control.
*/ */
unsigned int subtree_control; u16 subtree_control;
unsigned int child_subsys_mask; u16 subtree_ss_mask;
u16 old_subtree_control;
u16 old_subtree_ss_mask;
/* Private pointers for each registered subsystem */ /* Private pointers for each registered subsystem */
struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
...@@ -434,7 +440,6 @@ struct cgroup_subsys { ...@@ -434,7 +440,6 @@ struct cgroup_subsys {
void (*css_released)(struct cgroup_subsys_state *css); void (*css_released)(struct cgroup_subsys_state *css);
void (*css_free)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css);
void (*css_reset)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css);
void (*css_e_css_changed)(struct cgroup_subsys_state *css);
int (*can_attach)(struct cgroup_taskset *tset); int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset);
...@@ -446,7 +451,20 @@ struct cgroup_subsys { ...@@ -446,7 +451,20 @@ struct cgroup_subsys {
void (*free)(struct task_struct *task); void (*free)(struct task_struct *task);
void (*bind)(struct cgroup_subsys_state *root_css); void (*bind)(struct cgroup_subsys_state *root_css);
int early_init; bool early_init:1;
/*
* If %true, the controller, on the default hierarchy, doesn't show
* up in "cgroup.controllers" or "cgroup.subtree_control", is
* implicitly enabled on all cgroups on the default hierarchy, and
* bypasses the "no internal process" constraint. This is for
* utility type controllers which are transparent to userland.
*
* An implicit controller can be stolen from the default hierarchy
* anytime and thus must be okay with offline csses from previous
* hierarchies coexisting with csses for the current one.
*/
bool implicit_on_dfl:1;
/* /*
* If %false, this subsystem is properly hierarchical - * If %false, this subsystem is properly hierarchical -
...@@ -460,8 +478,8 @@ struct cgroup_subsys { ...@@ -460,8 +478,8 @@ struct cgroup_subsys {
* cases. Eventually, all subsystems will be made properly * cases. Eventually, all subsystems will be made properly
* hierarchical and this will go away. * hierarchical and this will go away.
*/ */
bool broken_hierarchy; bool broken_hierarchy:1;
bool warned_broken_hierarchy; bool warned_broken_hierarchy:1;
/* the following two fields are initialized automatically during boot */ /* the following two fields are initialized automatically during boot */
int id; int id;
......
...@@ -1047,10 +1047,10 @@ config CGROUP_PIDS ...@@ -1047,10 +1047,10 @@ config CGROUP_PIDS
is fairly trivial to reach PID exhaustion before you reach even a is fairly trivial to reach PID exhaustion before you reach even a
conservative kmemcg limit. As a result, it is possible to grind a conservative kmemcg limit. As a result, it is possible to grind a
system to halt without being limited by other cgroup policies. The system to halt without being limited by other cgroup policies. The
PIDs cgroup subsystem is designed to stop this from happening. PIDs controller is designed to stop this from happening.
It should be noted that organisational operations (such as attaching It should be noted that organisational operations (such as attaching
to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem, to a cgroup hierarchy) will *not* be blocked by the PIDs controller,
since the PIDs limit only affects a process's ability to fork, not to since the PIDs limit only affects a process's ability to fork, not to
attach to a cgroup. attach to a cgroup.
......
...@@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \ ...@@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files # Do not trace internal ftrace files
CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif endif
......
This diff is collapsed.
...@@ -2089,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { ...@@ -2089,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach, .attach = cpuset_attach,
.bind = cpuset_bind, .bind = cpuset_bind,
.legacy_cftypes = files, .legacy_cftypes = files,
.early_init = 1, .early_init = true,
}; };
/** /**
......
...@@ -8441,7 +8441,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { ...@@ -8441,7 +8441,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.can_attach = cpu_cgroup_can_attach, .can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach, .attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_files, .legacy_cftypes = cpu_files,
.early_init = 1, .early_init = true,
}; };
#endif /* CONFIG_CGROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */
......
...@@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { ...@@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc, .css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free, .css_free = cpuacct_css_free,
.legacy_cftypes = files, .legacy_cftypes = files,
.early_init = 1, .early_init = true,
}; };
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment