Commit fb0dc5f1 authored by Linus Torvalds

Merge branch 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - The destruction path of cgroup objects is asynchronous and
   multi-staged, and some stages ended up destroying parents before
   their children, leading to failures in the cpu and memory
   controllers.  Ensure that parents are always destroyed after their
   children (see the sketch after this list).

 - cpuset mm node migration was performed synchronously while holding
   the threadgroup and cgroup mutexes, and the recent threadgroup
   locking update made a deadlock possible.  The migration is best
   effort and shouldn't have been performed under those locks to begin
   with; it is now performed asynchronously from a workqueue (a
   user-space sketch of the pattern follows the diff).

 - Minor documentation fix.
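
The ordering guarantee in the first two fixes comes from a new per-css
online_cnt: each css counts itself plus its online children, and a css is
offlined (and the walk continues into its parent) only once that count drops
to zero.  Below is a minimal user-space model of that cascade, a sketch
rather than kernel code: the names (struct node, online_node, kill_node) are
made up, and plain C11 atomics stand in for the kernel's atomic_t and the
workqueue bounce through css_killed_work_fn.

/*
 * Minimal user-space model of the online_cnt cascade (illustrative names,
 * C11 atomics standing in for the kernel's atomic_t).
 */
#include <stdio.h>
#include <stdatomic.h>

struct node {
	const char *name;
	struct node *parent;
	atomic_int online_cnt;		/* 1 for self + 1 per online child */
};

static void online_node(struct node *n)
{
	atomic_fetch_add(&n->online_cnt, 1);			/* count self */
	if (n->parent)
		atomic_fetch_add(&n->parent->online_cnt, 1);	/* pin parent */
}

static void offline_node(struct node *n)
{
	printf("offlining %s\n", n->name);
}

/* Called when a node's last reference is killed. */
static void kill_node(struct node *n)
{
	/*
	 * Drop one count; only proceed (and cascade into the parent) once
	 * the node has no online children left.
	 */
	while (n && atomic_fetch_sub(&n->online_cnt, 1) == 1) {
		offline_node(n);
		n = n->parent;
	}
}

int main(void)
{
	struct node root  = { "root",  NULL,  0 };
	struct node child = { "child", &root, 0 };

	online_node(&root);
	online_node(&child);

	kill_node(&root);	/* root still pinned by child: nothing happens */
	kill_node(&child);	/* offlines child, then cascades to root */
	return 0;
}

Killing the root first does nothing because the child's count still pins it;
the root is offlined only after its last child goes offline, which is exactly
the ordering the patches below enforce.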

* 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1'
  cgroup: make sure a parent css isn't freed before its children
  cgroup: make sure a parent css isn't offlined before its children
  cpuset: make mm migration asynchronous
parents 9aece75c 9a2ddda5
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
 conventions of cgroup v2.  It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors.  All
 future changes must be reflected in this document.  Documentation for
-v1 is available under Documentation/cgroup-legacy/.
+v1 is available under Documentation/cgroup-v1/.
 
 CONTENTS
 
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -127,6 +127,12 @@ struct cgroup_subsys_state {
 	 */
 	u64 serial_nr;
 
+	/*
+	 * Incremented by online self and children.  Used to guarantee that
+	 * parents are not offlined before their children.
+	 */
+	atomic_t online_cnt;
+
 	/* percpu_ref killing and RCU release */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+extern void cpuset_post_attach_flush(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
+static inline void cpuset_post_attach_flush(void)
+{
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -58,6 +58,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
 #include <net/sock.h>
 
 /*
@@ -2739,6 +2740,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
+	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css free path */
+		struct cgroup_subsys_state *parent = css->parent;
 		int id = css->id;
 
-		if (css->parent)
-			css_put(css->parent);
-
 		ss->css_free(css);
 		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
+
+		if (parent)
+			css_put(parent);
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
 	css->serial_nr = css_serial_nr_next++;
+	atomic_set(&css->online_cnt, 0);
 
 	if (cgroup_parent(cgrp)) {
 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
 	if (!ret) {
 		css->flags |= CSS_ONLINE;
 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+		atomic_inc(&css->online_cnt);
+		if (css->parent)
+			atomic_inc(&css->parent->online_cnt);
 	}
 	return ret;
 }
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 
 	mutex_lock(&cgroup_mutex);
-	offline_css(css);
-	mutex_unlock(&cgroup_mutex);
 
-	css_put(css);
+	do {
+		offline_css(css);
+		css_put(css);
+		/* @css can't go away while we're holding cgroup_mutex */
+		css = css->parent;
+	} while (css && atomic_dec_and_test(&css->online_cnt));
+
+	mutex_unlock(&cgroup_mutex);
 }
 
 /* css kill confirmation processing requires process context, bounce */
@@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	INIT_WORK(&css->destroy_work, css_killed_work_fn);
-	queue_work(cgroup_destroy_wq, &css->destroy_work);
+	if (atomic_dec_and_test(&css->online_cnt)) {
+		INIT_WORK(&css->destroy_work, css_killed_work_fn);
+		queue_work(cgroup_destroy_wq, &css->destroy_work);
+	}
 }
 
 /**
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 }
 
 /*
- * cpuset_migrate_mm
- *
- *    Migrate memory region from one set of nodes to another.
- *
- *    Temporarilly set tasks mems_allowed to target nodes of migration,
- *    so that the migration code can allocate pages on these nodes.
- *
- *    While the mm_struct we are migrating is typically from some
- *    other task, the task_struct mems_allowed that we are hacking
- *    is for our current task, which must allocate new pages for that
- *    migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
  */
+
+struct cpuset_migrate_mm_work {
+	struct work_struct	work;
+	struct mm_struct	*mm;
+	nodemask_t		from;
+	nodemask_t		to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+	struct cpuset_migrate_mm_work *mwork =
+		container_of(work, struct cpuset_migrate_mm_work, work);
+
+	/* on a wq worker, no need to worry about %current's mems_allowed */
+	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+	mmput(mwork->mm);
+	kfree(mwork);
+}
+
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 							const nodemask_t *to)
 {
-	struct task_struct *tsk = current;
-
-	tsk->mems_allowed = *to;
-
-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+	struct cpuset_migrate_mm_work *mwork;
 
-	rcu_read_lock();
-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-	rcu_read_unlock();
+	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+	if (mwork) {
+		mwork->mm = mm;
+		mwork->from = *from;
+		mwork->to = *to;
+		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+		queue_work(cpuset_migrate_mm_wq, &mwork->work);
+	} else {
+		mmput(mm);
+	}
+}
+
+void cpuset_post_attach_flush(void)
+{
+	flush_workqueue(cpuset_migrate_mm_wq);
 }
 
 /*
@@ -1097,6 +1119,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-		mmput(mm);
+		else
+			mmput(mm);
 	}
 	css_task_iter_end(&it);
@@ -1545,10 +1568,10 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 			 * @old_mems_allowed is the right nodesets that we
 			 * migrate mm from.
 			 */
-			if (is_memory_migrate(cs)) {
+			if (is_memory_migrate(cs))
 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 						  &cpuset_attach_nodemask_to);
-			}
-			mmput(mm);
+			else
+				mmput(mm);
 		}
 	}
@@ -1714,6 +1737,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	mutex_unlock(&cpuset_mutex);
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
+	flush_workqueue(cpuset_migrate_mm_wq);
 	return retval ?: nbytes;
 }
 
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
 	top_cpuset.effective_mems = node_states[N_MEMORY];
 
 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+	BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**
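
For completeness, here is a rough user-space model of the pattern the
kernel/cpuset.c changes above introduce: while management locks are held, a
migration is merely queued; the queue is flushed only after those locks are
dropped, which is what cpuset_post_attach_flush() does in the patch.
Everything in this sketch (queue_migration, flush_migrations, the pthread
worker) is a hypothetical stand-in rather than kernel API.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct mig_work {
	struct mig_work *next;
	int from_node, to_node;		/* stand-ins for the real nodemasks */
};

static struct mig_work *head, **tail = &head;
static int pending;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Single ordered worker: requests run one at a time, in queueing order. */
static void *worker(void *unused)
{
	(void)unused;
	for (;;) {
		struct mig_work *w;

		pthread_mutex_lock(&lock);
		while (!head)
			pthread_cond_wait(&cond, &lock);
		w = head;
		head = w->next;
		if (!head)
			tail = &head;
		pthread_mutex_unlock(&lock);

		printf("migrating pages: node %d -> node %d\n",
		       w->from_node, w->to_node);
		free(w);

		pthread_mutex_lock(&lock);
		pending--;
		pthread_cond_broadcast(&cond);	/* wake any flusher */
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

/* Cheap and non-blocking: safe to call with other mutexes held. */
static void queue_migration(int from_node, int to_node)
{
	struct mig_work *w = calloc(1, sizeof(*w));

	if (!w)
		return;			/* best effort, like the patch */
	w->from_node = from_node;
	w->to_node = to_node;

	pthread_mutex_lock(&lock);
	*tail = w;
	tail = &w->next;
	pending++;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

/* The moral equivalent of cpuset_post_attach_flush(). */
static void flush_migrations(void)
{
	pthread_mutex_lock(&lock);
	while (pending)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* "attach" path: queue while management locks are notionally held */
	queue_migration(0, 1);
	queue_migration(1, 0);

	/* locks dropped, now it is safe to wait */
	flush_migrations();
	printf("all queued migrations finished\n");
	return 0;
}

A single ordered consumer preserves queueing order, mirroring the
alloc_ordered_workqueue() used above, and a failed allocation simply skips
the best-effort migration, just as the patch falls back to mmput().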