Commit 5c5cc623 authored by Li Zefan, committed by Tejun Heo

cpuset: allow to keep tasks in empty cpusets

To achieve this:

- We call update_tasks_cpumask/nodemask() for empty cpusets when
  hotplug happens, instead of moving tasks out of them.

- When a cpuset's masks are changed by writing cpuset.cpus/mems,
  we also update tasks in its empty descendant cpusets, so that they
  take on the new masks (a toy model of this inheritance follows).
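
Roughly, the resulting semantics can be modeled like this (a toy
sketch, not kernel code; the struct and function names are invented
for illustration, and in the kernel the equivalent ancestor walk is
done by the effective_{cpumask,nodemask}_cpuset() helpers introduced
by the parent commit):

    /* Toy model: tasks stay in an empty cpuset and effectively run on
     * the mask of the nearest ancestor whose mask is non-empty. */
    struct toy_cpuset {
            struct toy_cpuset *parent;  /* NULL for the root cpuset */
            unsigned long cpus;         /* toy CPU bitmask; 0 == empty */
    };

    static unsigned long toy_effective_cpus(const struct toy_cpuset *cs)
    {
            /* climb towards the root until a non-empty mask is found */
            while (cs->cpus == 0 && cs->parent)
                    cs = cs->parent;
            return cs->cpus;
    }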

v3:
- do propagation work in one place for both hotplug and unplug

v2:
- drop rcu_read_lock before calling update_tasks_nodemask() and
  update_tasks_cpumask() (the locking pattern is sketched below),
  instead of using a workqueue.
- add documentation in include/linux/cgroup.h
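
For reference, this is the locking pattern both new helpers follow,
shown as a simplified excerpt of the diff below with explanatory
comments added (the real code is in update_tasks_cpumask_hier() and
update_tasks_nodemask_hier()):

    rcu_read_lock();
    cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
            /* non-empty cpusets keep their own masks; skip the subtree */
            if (!cpumask_empty(cp->cpus_allowed)) {
                    pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
                    continue;
            }

            if (!css_tryget(&cp->css))  /* pin @cp across the unlock */
                    continue;

            rcu_read_unlock();          /* update_tasks_cpumask() can sleep */
            update_tasks_cpumask(cp, heap);
            rcu_read_lock();            /* resume the RCU-protected walk */
            css_put(&cp->css);
    }
    rcu_read_unlock();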
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent 070b57fc
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -277,6 +277,10 @@ enum {
  *
  * - Remount is disallowed.
  *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens
+ *   and take masks of ancestors with non-empty cpus/mems, instead of
+ *   being moved to an ancestor.
+ *
  * - memcg: use_hierarchy is on by default and the cgroup file for
  *   the flag is not created.
  *
...
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -874,6 +874,45 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 	cgroup_scan_tasks(&scan);
 }
 
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on the cpumask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_cpumask_hier(struct cpuset *root_cs,
+				      bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_cpumask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp has some CPUs */
+		if (!cpumask_empty(cp->cpus_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+
+		if (!css_tryget(&cp->css))
+			continue;
+
+		rcu_read_unlock();
+		update_tasks_cpumask(cp, heap);
+		rcu_read_lock();
+		css_put(&cp->css);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -925,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	/*
-	 * Scan tasks in the cpuset, and update the cpumasks of any
-	 * that need an update.
-	 */
-	update_tasks_cpumask(cs, &heap);
+	update_tasks_cpumask_hier(cs, true, &heap);
 
 	heap_free(&heap);
@@ -1096,6 +1131,45 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
 	cpuset_being_rebound = NULL;
 }
 
+/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on the nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs,
+				       bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_nodemask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp has some memory nodes */
+		if (!nodes_empty(cp->mems_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+
+		if (!css_tryget(&cp->css))
+			continue;
+
+		rcu_read_unlock();
+		update_tasks_nodemask(cp, heap);
+		rcu_read_lock();
+		css_put(&cp->css);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
@@ -1160,7 +1234,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	cs->mems_allowed = trialcs->mems_allowed;
 	mutex_unlock(&callback_mutex);
 
-	update_tasks_nodemask(cs, &heap);
+	update_tasks_nodemask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 done:
@@ -2048,6 +2122,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	static cpumask_t off_cpus;
 	static nodemask_t off_mems;
 	bool is_empty;
+	bool sane = cgroup_sane_behavior(cs->css.cgroup);
 
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2066,21 +2141,29 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
 	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
 
-	/* remove offline cpus from @cs */
-	if (!cpumask_empty(&off_cpus)) {
-		mutex_lock(&callback_mutex);
-		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * If the sane_behavior flag is set, we need to update tasks' cpumask
+	 * for an empty cpuset to take on an ancestor's cpumask.
+	 */
+	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+	    !cpumask_empty(&off_cpus))
 		update_tasks_cpumask(cs, NULL);
-	}
 
-	/* remove offline mems from @cs */
-	if (!nodes_empty(off_mems)) {
-		mutex_lock(&callback_mutex);
-		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * If the sane_behavior flag is set, we need to update tasks' nodemask
+	 * for an empty cpuset to take on an ancestor's nodemask.
+	 */
+	if ((sane && nodes_empty(cs->mems_allowed)) ||
+	    !nodes_empty(off_mems))
 		update_tasks_nodemask(cs, NULL);
-	}
 
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
@@ -2088,11 +2171,13 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	mutex_unlock(&cpuset_mutex);
 
 	/*
-	 * If @cs became empty, move tasks to the nearest ancestor with
-	 * execution resources.  This is a full cgroup operation which will
+	 * If the sane_behavior flag is set, we'll keep tasks in empty cpusets.
+	 *
+	 * Otherwise move tasks to the nearest ancestor with execution
+	 * resources.  This is a full cgroup operation which will
 	 * also call back into cpuset.  Should be done outside any lock.
 	 */
-	if (is_empty)
+	if (!sane && is_empty)
 		remove_tasks_in_empty_cpuset(cs);
 }
@@ -2114,10 +2199,9 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
  */
 static void cpuset_hotplug_workfn(struct work_struct *work)
 {
-	static cpumask_t new_cpus, tmp_cpus;
-	static nodemask_t new_mems, tmp_mems;
+	static cpumask_t new_cpus;
+	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
-	bool cpus_offlined, mems_offlined;
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2126,12 +2210,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	new_mems = node_states[N_MEMORY];
 
 	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
-				       &new_cpus);
 	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
-	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
-	mems_offlined = !nodes_empty(tmp_mems);
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
@@ -2151,8 +2230,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	mutex_unlock(&cpuset_mutex);
 
-	/* if cpus or mems went down, we need to propagate to descendants */
-	if (cpus_offlined || mems_offlined) {
+	/* if cpus or mems changed, we need to propagate to descendants */
+	if (cpus_updated || mems_updated) {
 		struct cpuset *cs;
 		struct cgroup *pos_cgrp;
...
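
A distilled view of the new hotplug gating in
cpuset_hotplug_update_tasks(): under sane behavior, tasks' masks are
refreshed even when nothing went offline in this cpuset, as long as
the cpuset ended up empty. As a hypothetical helper (illustration
only; the patch open-codes this condition twice, once for cpus and
once for mems):

    /* Rebind tasks' cpumasks after hotplug if CPUs actually went away,
     * or if the cpuset is now empty under sane behavior, so its tasks
     * pick up an ancestor's mask. */
    static bool need_cpumask_update(bool sane, bool cs_now_empty,
                                    bool cpus_went_offline)
    {
            return (sane && cs_now_empty) || cpus_went_offline;
    }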