Commit 0011572c authored by Linus Torvalds

Merge branch 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
 "This has an unusually high density of tricky fixes:

   - task_get_css() could deadlock when it races against a dying cgroup.

   - cgroup.procs didn't list thread group leaders with live threads.

     This could mislead readers into thinking that a cgroup is empty
     when it's not. Fixed by making PROCS iterator include dead tasks.
     I made a couple of mistakes making this change and this pull
     request contains a couple of follow-up patches.

   - When cpusets run out of online cpus, the cpumasks of member tasks
     were updated in bizarre ways. Joel improved the behavior significantly"

* 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: restore sanity to cpuset_cpus_allowed_fallback()
  cgroup: Fix css_task_iter_advance_css_set() cset skip condition
  cgroup: css_task_iter_skip()'d iterators must be advanced before accessed
  cgroup: Include dying leaders with live threads in PROCS iterations
  cgroup: Implement css_task_iter_skip()
  cgroup: Call cgroup_release() before __exit_signal()
  docs cgroups: add another example size for hugetlb
  cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css()
parents 6aa7a22b d477f8c2
...@@ -32,14 +32,18 @@ Brief summary of control files ...@@ -32,14 +32,18 @@ Brief summary of control files
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
For a system supporting two hugepage size (16M and 16G) the control For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include: files include:
hugetlb.16GB.limit_in_bytes hugetlb.1GB.limit_in_bytes
hugetlb.16GB.max_usage_in_bytes hugetlb.1GB.max_usage_in_bytes
hugetlb.16GB.usage_in_bytes hugetlb.1GB.usage_in_bytes
hugetlb.16GB.failcnt hugetlb.1GB.failcnt
hugetlb.16MB.limit_in_bytes hugetlb.64KB.limit_in_bytes
hugetlb.16MB.max_usage_in_bytes hugetlb.64KB.max_usage_in_bytes
hugetlb.16MB.usage_in_bytes hugetlb.64KB.usage_in_bytes
hugetlb.16MB.failcnt hugetlb.64KB.failcnt
hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes
hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt
...@@ -221,6 +221,7 @@ struct css_set { ...@@ -221,6 +221,7 @@ struct css_set {
*/ */
struct list_head tasks; struct list_head tasks;
struct list_head mg_tasks; struct list_head mg_tasks;
struct list_head dying_tasks;
/* all css_task_iters currently walking this cset */ /* all css_task_iters currently walking this cset */
struct list_head task_iters; struct list_head task_iters;
......
...@@ -43,6 +43,9 @@ ...@@ -43,6 +43,9 @@
/* walk all threaded css_sets in the domain */ /* walk all threaded css_sets in the domain */
#define CSS_TASK_ITER_THREADED (1U << 1) #define CSS_TASK_ITER_THREADED (1U << 1)
/* internal flags */
#define CSS_TASK_ITER_SKIPPED (1U << 16)
/* a css_task_iter should be treated as an opaque object */ /* a css_task_iter should be treated as an opaque object */
struct css_task_iter { struct css_task_iter {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
...@@ -57,6 +60,7 @@ struct css_task_iter { ...@@ -57,6 +60,7 @@ struct css_task_iter {
struct list_head *task_pos; struct list_head *task_pos;
struct list_head *tasks_head; struct list_head *tasks_head;
struct list_head *mg_tasks_head; struct list_head *mg_tasks_head;
struct list_head *dying_tasks_head;
struct css_set *cur_cset; struct css_set *cur_cset;
struct css_set *cur_dcset; struct css_set *cur_dcset;
...@@ -487,7 +491,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, ...@@ -487,7 +491,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
* *
* Find the css for the (@task, @subsys_id) combination, increment a * Find the css for the (@task, @subsys_id) combination, increment a
* reference on and return it. This function is guaranteed to return a * reference on and return it. This function is guaranteed to return a
* valid css. * valid css. The returned css may already have been offlined.
*/ */
static inline struct cgroup_subsys_state * static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id) task_get_css(struct task_struct *task, int subsys_id)
...@@ -497,7 +501,13 @@ task_get_css(struct task_struct *task, int subsys_id) ...@@ -497,7 +501,13 @@ task_get_css(struct task_struct *task, int subsys_id)
rcu_read_lock(); rcu_read_lock();
while (true) { while (true) {
css = task_css(task, subsys_id); css = task_css(task, subsys_id);
if (likely(css_tryget_online(css))) /*
* Can't use css_tryget_online() here. A task which has
* PF_EXITING set may stay associated with an offline css.
* If such task calls this function, css_tryget_online()
* will keep failing.
*/
if (likely(css_tryget(css)))
break; break;
cpu_relax(); cpu_relax();
} }
......
...@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; ...@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
static int cgroup_apply_control(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it); static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss); struct cgroup_subsys *ss);
...@@ -738,6 +739,7 @@ struct css_set init_css_set = { ...@@ -738,6 +739,7 @@ struct css_set init_css_set = {
.dom_cset = &init_css_set, .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks), .tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
...@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) ...@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
cgroup_update_populated(link->cgrp, populated); cgroup_update_populated(link->cgrp, populated);
} }
/*
* @task is leaving, advance task iterators which are pointing to it so
* that they can resume at the next position. Advancing an iterator might
* remove it from the list, use safe walk. See css_task_iter_skip() for
* details.
*/
static void css_set_skip_task_iters(struct css_set *cset,
struct task_struct *task)
{
struct css_task_iter *it, *pos;
list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
css_task_iter_skip(it, task);
}
/** /**
* css_set_move_task - move a task from one css_set to another * css_set_move_task - move a task from one css_set to another
* @task: task being moved * @task: task being moved
...@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task, ...@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
css_set_update_populated(to_cset, true); css_set_update_populated(to_cset, true);
if (from_cset) { if (from_cset) {
struct css_task_iter *it, *pos;
WARN_ON_ONCE(list_empty(&task->cg_list)); WARN_ON_ONCE(list_empty(&task->cg_list));
/* css_set_skip_task_iters(from_cset, task);
* @task is leaving, advance task iterators which are
* pointing to it so that they can resume at the next
* position. Advancing an iterator might remove it from
* the list, use safe walk. See css_task_iter_advance*()
* for details.
*/
list_for_each_entry_safe(it, pos, &from_cset->task_iters,
iters_node)
if (it->task_pos == &task->cg_list)
css_task_iter_advance(it);
list_del_init(&task->cg_list); list_del_init(&task->cg_list);
if (!css_set_populated(from_cset)) if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false); css_set_update_populated(from_cset, false);
...@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, ...@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
cset->dom_cset = cset; cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets); INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist); INIT_HLIST_NODE(&cset->hlist);
...@@ -4408,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) ...@@ -4408,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
it->task_pos = NULL; it->task_pos = NULL;
return; return;
} }
} while (!css_set_populated(cset)); } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
if (!list_empty(&cset->tasks)) if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next; it->task_pos = cset->tasks.next;
else else if (!list_empty(&cset->mg_tasks))
it->task_pos = cset->mg_tasks.next; it->task_pos = cset->mg_tasks.next;
else
it->task_pos = cset->dying_tasks.next;
it->tasks_head = &cset->tasks; it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks; it->mg_tasks_head = &cset->mg_tasks;
it->dying_tasks_head = &cset->dying_tasks;
/* /*
* We don't keep css_sets locked across iteration steps and thus * We don't keep css_sets locked across iteration steps and thus
...@@ -4442,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) ...@@ -4442,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
list_add(&it->iters_node, &cset->task_iters); list_add(&it->iters_node, &cset->task_iters);
} }
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task)
{
lockdep_assert_held(&css_set_lock);
if (it->task_pos == &task->cg_list) {
it->task_pos = it->task_pos->next;
it->flags |= CSS_TASK_ITER_SKIPPED;
}
}
static void css_task_iter_advance(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it)
{ {
struct list_head *next; struct task_struct *task;
lockdep_assert_held(&css_set_lock); lockdep_assert_held(&css_set_lock);
repeat: repeat:
...@@ -4454,25 +4473,40 @@ static void css_task_iter_advance(struct css_task_iter *it) ...@@ -4454,25 +4473,40 @@ static void css_task_iter_advance(struct css_task_iter *it)
* consumed first and then ->mg_tasks. After ->mg_tasks, * consumed first and then ->mg_tasks. After ->mg_tasks,
* we move onto the next cset. * we move onto the next cset.
*/ */
next = it->task_pos->next; if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
if (next == it->tasks_head) else
next = it->mg_tasks_head->next; it->task_pos = it->task_pos->next;
if (next == it->mg_tasks_head) if (it->task_pos == it->tasks_head)
it->task_pos = it->mg_tasks_head->next;
if (it->task_pos == it->mg_tasks_head)
it->task_pos = it->dying_tasks_head->next;
if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it); css_task_iter_advance_css_set(it);
else
it->task_pos = next;
} else { } else {
/* called from start, proceed to the first cset */ /* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it); css_task_iter_advance_css_set(it);
} }
if (!it->task_pos)
return;
task = list_entry(it->task_pos, struct task_struct, cg_list);
if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */ /* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && if (!thread_group_leader(task))
!thread_group_leader(list_entry(it->task_pos, struct task_struct, goto repeat;
cg_list)))
/* and dying leaders w/o live member threads */
if (!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
if (task->flags & PF_EXITING)
goto repeat; goto repeat;
}
} }
/** /**
...@@ -4528,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) ...@@ -4528,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
spin_lock_irq(&css_set_lock); spin_lock_irq(&css_set_lock);
/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
css_task_iter_advance(it);
if (it->task_pos) { if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct, it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list); cg_list);
...@@ -6009,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk) ...@@ -6009,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) { if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock); spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false); css_set_move_task(tsk, cset, NULL, false);
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--; cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk)); WARN_ON_ONCE(cgroup_task_frozen(tsk));
...@@ -6034,6 +6073,13 @@ void cgroup_release(struct task_struct *task) ...@@ -6034,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) { do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task); ss->release(task);
} while_each_subsys_mask(); } while_each_subsys_mask();
if (use_task_css_set_links) {
spin_lock_irq(&css_set_lock);
css_set_skip_task_iters(task_css_set(task), task);
list_del_init(&task->cg_list);
spin_unlock_irq(&css_set_lock);
}
} }
void cgroup_free(struct task_struct *task) void cgroup_free(struct task_struct *task)
......
...@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) ...@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
spin_unlock_irqrestore(&callback_lock, flags); spin_unlock_irqrestore(&callback_lock, flags);
} }
/**
* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
* @tsk: pointer to task_struct with which the scheduler is struggling
*
* Description: In the case that the scheduler cannot find an allowed cpu in
* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
* mode however, this value is the same as task_cs(tsk)->effective_cpus,
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
**/
void cpuset_cpus_allowed_fallback(struct task_struct *tsk) void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{ {
rcu_read_lock(); rcu_read_lock();
do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); do_set_cpus_allowed(tsk, is_in_v2_mode() ?
task_cs(tsk)->cpus_allowed : cpu_possible_mask);
rcu_read_unlock(); rcu_read_unlock();
/* /*
......
...@@ -195,6 +195,7 @@ void release_task(struct task_struct *p) ...@@ -195,6 +195,7 @@ void release_task(struct task_struct *p)
rcu_read_unlock(); rcu_read_unlock();
proc_flush_task(p); proc_flush_task(p);
cgroup_release(p);
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
ptrace_release_task(p); ptrace_release_task(p);
...@@ -220,7 +221,6 @@ void release_task(struct task_struct *p) ...@@ -220,7 +221,6 @@ void release_task(struct task_struct *p)
} }
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
cgroup_release(p);
release_thread(p); release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct); call_rcu(&p->rcu, delayed_put_task_struct);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment