Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo: - cgroup.kill is added which implements atomic killing of the whole subtree. Down the line, this should be able to replace the multiple userland implementations of "keep killing till empty". - PSI can now be turned off at boot time to avoid overhead for configurations which don't care about PSI. * 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: make per-cgroup pressure stall tracking configurable cgroup: Fix kernel-doc cgroup: inline cgroup_task_freeze() tests/cgroup: test cgroup.kill tests/cgroup: move cg_wait_for(), cg_prepare_for_wait() tests/cgroup: use cgroup.kill in cg_killall() docs/cgroup: add entry for cgroup.kill cgroup: introduce cgroup.kill

Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - cgroup.kill is added which implements atomic killing of the whole subtree. Down the line, this should be able to replace the multiple userland implementations of "keep killing till empty". - PSI can now be turned off at boot time to avoid overhead for configurations which don't care about PSI. * 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: make per-cgroup pressure stall tracking configurable cgroup: Fix kernel-doc cgroup: inline cgroup_task_freeze() tests/cgroup: test cgroup.kill tests/cgroup: move cg_wait_for(), cg_prepare_for_wait() tests/cgroup: use cgroup.kill in cg_killall() docs/cgroup: add entry for cgroup.kill cgroup: introduce cgroup.kill
3dbdb38e · Linus Torvalds · e267992f · 3958e2d0 · 3dbdb38e · 3dbdb38e
Commit 3dbdb38e authored Jul 01, 2021 by Linus Torvalds
13 changed files
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -953,6 +953,21 @@ All cgroup core files are prefixed with "cgroup."
 	it's possible to delete a frozen (and empty) cgroup, as well as
 	create new sub-cgroups.
+  cgroup.kill
+	A write-only single value file which exists in non-root cgroups.
+	The only allowed value is "1".
+	Writing "1" to the file causes the cgroup and all descendant cgroups to
+	be killed. This means that all processes located in the affected cgroup
+	tree will be killed via SIGKILL.
+	Killing a cgroup tree will deal with concurrent forks appropriately and
+	is protected against migrations.
+	In a threaded cgroup, writing this file fails with EOPNOTSUPP as
+	killing cgroups is a process directed operation, i.e. it affects
+	the whole thread-group.
 Controllers
 ===========

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -497,16 +497,21 @@
 	ccw_timeout_log	[S390]
 			See Documentation/s390/common_io.rst for details.
-	cgroup_disable=	[KNL] Disable a particular controller
+	cgroup_disable=	[KNL] Disable a particular controller or optional feature
-			Format: {name of the controller(s) to disable}
+			Format: {name of the controller(s) or feature(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
 			  a single hierarchy
 			- foo isn't visible as an individually mountable
 			  subsystem
+			- if foo is an optional feature then the feature is
+			  disabled and corresponding cgroup files are not
+			  created
 			{Currently only "memory" controller deal with this and
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
+			Specifying "pressure" disables per-cgroup pressure
+			stall information accounting feature
 	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
 			Format: { { controller | "all" | "named" }

--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -71,6 +71,9 @@ enum {
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
+	/* Control group has to be killed. */
+	CGRP_KILL,
 };
 /* cgroup_root->flags */
@@ -110,6 +113,7 @@ enum {
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_DEBUG		= (1 << 5),	/* create when cgroup_debug */
+	CFTYPE_PRESSURE		= (1 << 6),	/* only if pressure feature is enabled */
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */

--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return &cgrp->psi;
 }
+bool cgroup_psi_enabled(void);
 static inline void cgroup_init_kthreadd(void)
 {
 	/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return NULL;
 }
+static inline bool cgroup_psi_enabled(void)
+{
+	return false;
+}
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 					       struct cgroup *ancestor)
 {
@@ -906,20 +913,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze);
 void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
 				 struct cgroup *dst);
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	bool ret;
-	if (task->flags & PF_KTHREAD)
-		return false;
-	rcu_read_lock();
-	ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
-	rcu_read_unlock();
-	return ret;
-}
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return task->frozen;
@@ -929,10 +922,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 static inline void cgroup_enter_frozen(void) { }
 static inline void cgroup_leave_frozen(bool always_leave) { }
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	return false;
-}
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return false;

--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+	OPT_FEATURE_PRESSURE,
+#endif
+	OPT_FEATURE_COUNT
+};
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+	"pressure",
+#endif
+};
+static u16 cgroup_feature_disable_mask __read_mostly;
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
@@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 }
 /**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	psi_trigger_replace(&of->priv, NULL);
 }
+bool cgroup_psi_enabled(void)
+{
+	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+	return false;
+}
 #endif /* CONFIG_PSI */
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
 	return nbytes;
 }
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+	lockdep_assert_held(&cgroup_mutex);
+	spin_lock_irq(&css_set_lock);
+	set_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+	while ((task = css_task_iter_next(&it))) {
+		/* Ignore kernel threads here. */
+		if (task->flags & PF_KTHREAD)
+			continue;
+		/* Skip tasks that are already dying. */
+		if (__fatal_signal_pending(task))
+			continue;
+		send_sig(SIGKILL, task, 0);
+	}
+	css_task_iter_end(&it);
+	spin_lock_irq(&css_set_lock);
+	clear_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+}
+static void cgroup_kill(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *dsct;
+	lockdep_assert_held(&cgroup_mutex);
+	cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+		__cgroup_kill(dsct);
+}
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	ssize_t ret = 0;
+	int kill;
+	struct cgroup *cgrp;
+	ret = kstrtoint(strstrip(buf), 0, &kill);
+	if (ret)
+		return ret;
+	if (kill != 1)
+		return -ERANGE;
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENOENT;
+	/*
+	 * Killing is a process directed operation, i.e. the whole thread-group
+	 * is taken down so act like we do for cgroup.procs and only make this
+	 * writable in non-threaded cgroups.
+	 */
+	if (cgroup_is_threaded(cgrp))
+		ret = -EOPNOTSUPP;
+	else
+		cgroup_kill(cgrp);
+	cgroup_kn_unlock(of->kn);
+	return ret ?: nbytes;
+}
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of_cft(of);
@@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
 	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 		WARN_ON(cft->ss || cft->kf_ops);
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if (cft->seq_start)
 			kf_ops = &cgroup_kf_ops;
 		else
@@ -4860,6 +4967,11 @@ static struct cftype cgroup_base_files[] = {
 		.seq_show = cgroup_freeze_show,
 		.write = cgroup_freeze_write,
 	},
+	{
+		.name = "cgroup.kill",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = cgroup_kill_write,
+	},
 	{
 		.name = "cpu.stat",
 		.seq_show = cpu_stat_show,
@@ -4867,6 +4979,7 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "memory.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
 		      struct kernel_clone_args *kargs)
 	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
+	unsigned long cgrp_flags = 0;
+	bool kill = false;
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	int i;
@@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,
 	/* init tasks are special, only link regular threads */
 	if (likely(child->pid)) {
+		if (kargs->cgrp)
+			cgrp_flags = kargs->cgrp->flags;
+		else
+			cgrp_flags = cset->dfl_cgrp->flags;
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
 		css_set_move_task(child, NULL, cset, false);
@@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child,
 		cset = NULL;
 	}
-	/*
+	if (!(child->flags & PF_KTHREAD)) {
-	 * If the cgroup has to be frozen, the new task has too.  Let's set
+		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
-	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
+			/*
-	 * frozen state.
+			 * If the cgroup has to be frozen, the new task has
-	 */
+			 * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
-	if (unlikely(cgroup_task_freeze(child))) {
+			 * get the task into the frozen state.
-		spin_lock(&child->sighand->siglock);
+			 */
-		WARN_ON_ONCE(child->frozen);
+			spin_lock(&child->sighand->siglock);
-		child->jobctl |= JOBCTL_TRAP_FREEZE;
+			WARN_ON_ONCE(child->frozen);
-		spin_unlock(&child->sighand->siglock);
+			child->jobctl |= JOBCTL_TRAP_FREEZE;
+			spin_unlock(&child->sighand->siglock);
+			/*
+			 * Calling cgroup_update_frozen() isn't required here,
+			 * because it will be called anyway a bit later from
+			 * do_freezer_trap(). So we avoid cgroup's transient
+			 * switch from the frozen state and back.
+			 */
+		}
 		/*
-		 * Calling cgroup_update_frozen() isn't required here,
+		 * If the cgroup is to be killed notice it now and take the
-		 * because it will be called anyway a bit later from
+		 * child down right after we finished preparing it for
-		 * do_freezer_trap(). So we avoid cgroup's transient switch
+		 * userspace.
-		 * from the frozen state and back.
 		 */
+		kill = test_bit(CGRP_KILL, &cgrp_flags);
 	}
 	spin_unlock_irq(&css_set_lock);
@@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
 		put_css_set(rcset);
 	}
+	/* Cgroup has to be killed so take down child immediately. */
+	if (unlikely(kill))
+		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
 	cgroup_css_set_put_fork(kargs);
 }
@@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
 	cset->nr_tasks--;
 	WARN_ON_ONCE(cgroup_task_frozen(tsk));
-	if (unlikely(cgroup_task_freeze(tsk)))
+	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));
 	spin_unlock_irq(&css_set_lock);
@@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
 			pr_info("Disabling %s control group subsystem\n",
 				ss->name);
 		}
+		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+			if (strcmp(token, cgroup_opt_feature_names[i]))
+				continue;
+			cgroup_feature_disable_mask |= 1 << i;
+			pr_info("Disabling %s control group feature\n",
+				cgroup_opt_feature_names[i]);
+			break;
+		}
 	}
 	return 1;
 }
@@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
 		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
 			continue;
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if (prefix)
 			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
 }
 /**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be

--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -148,6 +148,7 @@
 static int psi_bug __read_mostly;
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
 static bool psi_enable;
@@ -215,6 +216,9 @@ void __init psi_init(void)
 		return;
 	}
+	if (!cgroup_psi_enabled())
+		static_branch_disable(&psi_cgroups_enabled);
 	psi_period = jiffies_to_nsecs(PSI_FREQ);
 	group_init(&psi_system);
 }
@@ -748,23 +752,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 {
+	if (*iter == &psi_system)
+		return NULL;
 #ifdef CONFIG_CGROUPS
-	struct cgroup *cgroup = NULL;
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+		struct cgroup *cgroup = NULL;
-	if (!*iter)
+		if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
+			cgroup = task->cgroups->dfl_cgrp;
-	else if (*iter == &psi_system)
+		else
-		return NULL;
+			cgroup = cgroup_parent(*iter);
-	else
-		cgroup = cgroup_parent(*iter);
-	if (cgroup && cgroup_parent(cgroup)) {
+		if (cgroup && cgroup_parent(cgroup)) {
-		*iter = cgroup;
+			*iter = cgroup;
-		return cgroup_psi(cgroup);
+			return cgroup_psi(cgroup);
+		}
 	}
-#else
-	if (*iter)
-		return NULL;
 #endif
 	*iter = &psi_system;
 	return &psi_system;

--- a/tools/testing/selftests/cgroup/.gitignore
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -2,4 +2,5 @@
 test_memcontrol
 test_core
 test_freezer
 test_kmem
\ No newline at end of file
+test_kill
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -9,6 +9,7 @@ TEST_GEN_PROGS = test_memcontrol
 TEST_GEN_PROGS += test_kmem
 TEST_GEN_PROGS += test_core
 TEST_GEN_PROGS += test_freezer
+TEST_GEN_PROGS += test_kill
 include ../lib.mk
@@ -16,3 +17,4 @@ $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_kill: cgroup_util.c ../clone3/clone3_selftests.h ../pidfd/pidfd.h
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -5,10 +5,12 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/limits.h>
+#include <poll.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/inotify.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -252,6 +254,10 @@ int cg_killall(const char *cgroup)
 	char buf[PAGE_SIZE];
 	char *ptr = buf;
+	/* If cgroup.kill exists use it. */
+	if (!cg_write(cgroup, "cgroup.kill", "1"))
+		return 0;
 	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
 		return -1;
@@ -576,3 +582,48 @@ int clone_into_cgroup_run_wait(const char *cgroup)
 	(void)clone_reap(pid, WEXITED);
 	return 0;
 }
+int cg_prepare_for_wait(const char *cgroup)
+{
+	int fd, ret = -1;
+	fd = inotify_init1(0);
+	if (fd == -1)
+		return fd;
+	ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
+				IN_MODIFY);
+	if (ret == -1) {
+		close(fd);
+		fd = -1;
+	}
+	return fd;
+}
+int cg_wait_for(int fd)
+{
+	int ret = -1;
+	struct pollfd fds = {
+		.fd = fd,
+		.events = POLLIN,
+	};
+	while (true) {
+		ret = poll(&fds, 1, 10000);
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+			break;
+		}
+		if (ret > 0 && fds.revents & POLLIN) {
+			ret = 0;
+			break;
+		}
+	}
+	return ret;
+}
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -54,3 +54,5 @@ extern pid_t clone_into_cgroup(int cgroup_fd);
 extern int clone_reap(pid_t pid, int options);
 extern int clone_into_cgroup_run_wait(const char *cgroup);
 extern int dirfd_open_opath(const char *dir);
+extern int cg_prepare_for_wait(const char *cgroup);
+extern int cg_wait_for(int fd);
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -7,9 +7,7 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <errno.h>
-#include <poll.h>
 #include <stdlib.h>
-#include <sys/inotify.h>
 #include <string.h>
 #include <sys/wait.h>
@@ -54,61 +52,6 @@ static int cg_freeze_nowait(const char *cgroup, bool freeze)
 	return cg_write(cgroup, "cgroup.freeze", freeze ? "1" : "0");
 }
-/*
- * Prepare for waiting on cgroup.events file.
- */
-static int cg_prepare_for_wait(const char *cgroup)
-{
-	int fd, ret = -1;
-	fd = inotify_init1(0);
-	if (fd == -1) {
-		debug("Error: inotify_init1() failed\n");
-		return fd;
-	}
-	ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
-				IN_MODIFY);
-	if (ret == -1) {
-		debug("Error: inotify_add_watch() failed\n");
-		close(fd);
-		fd = -1;
-	}
-	return fd;
-}
-/*
- * Wait for an event. If there are no events for 10 seconds,
- * treat this an error.
- */
-static int cg_wait_for(int fd)
-{
-	int ret = -1;
-	struct pollfd fds = {
-		.fd = fd,
-		.events = POLLIN,
-	};
-	while (true) {
-		ret = poll(&fds, 1, 10000);
-		if (ret == -1) {
-			if (errno == EINTR)
-				continue;
-			debug("Error: poll() failed\n");
-			break;
-		}
-		if (ret > 0 && fds.revents & POLLIN) {
-			ret = 0;
-			break;
-		}
-	}
-	return ret;
-}
 /*
 * Attach a task to the given cgroup and wait for a cgroup frozen event.
 * All transient events (e.g. populated) are ignored.

--- a/tools/testing/selftests/cgroup/test_kill.c
+++ b/tools/testing/selftests/cgroup/test_kill.c
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <errno.h>
+#include <linux/limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "../kselftest.h"
+#include "../pidfd/pidfd.h"
+#include "cgroup_util.h"
+/*
+ * Kill the given cgroup and wait for the inotify signal.
+ * If there are no events in 10 seconds, treat this as an error.
+ * Then check that the cgroup is in the desired state.
+ */
+static int cg_kill_wait(const char *cgroup)
+{
+	int fd, ret = -1;
+	fd = cg_prepare_for_wait(cgroup);
+	if (fd < 0)
+		return fd;
+	ret = cg_write(cgroup, "cgroup.kill", "1");
+	if (ret)
+		goto out;
+	ret = cg_wait_for(fd);
+	if (ret)
+		goto out;
+out:
+	close(fd);
+	return ret;
+}
+/*
+ * A simple process running in a sleep loop until being
+ * re-parented.
+ */
+static int child_fn(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	while (getppid() == ppid)
+		usleep(1000);
+	return getppid() == ppid;
+}
+static int test_cgkill_simple(const char *root)
+{
+	pid_t pids[100];
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	int i;
+	cgroup = cg_name(root, "cg_test_simple");
+	if (!cgroup)
+		goto cleanup;
+	if (cg_create(cgroup))
+		goto cleanup;
+	for (i = 0; i < 100; i++)
+		pids[i] = cg_run_nowait(cgroup, child_fn, NULL);
+	if (cg_wait_for_proc_count(cgroup, 100))
+		goto cleanup;
+	if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n"))
+		goto cleanup;
+	if (cg_kill_wait(cgroup))
+		goto cleanup;
+	ret = KSFT_PASS;
+cleanup:
+	for (i = 0; i < 100; i++)
+		wait_for_pid(pids[i]);
+	if (ret == KSFT_PASS &&
+	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+		ret = KSFT_FAIL;
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+/*
+ * The test creates the following hierarchy:
+ *       A
+ *    / / \ \
+ *   B  E  I K
+ *  /\  |
+ * C  D F
+ *      |
+ *      G
+ *      |
+ *      H
+ *
+ * with a process in C, H and 3 processes in K.
+ * Then it tries to kill the whole tree.
+ */
+static int test_cgkill_tree(const char *root)
+{
+	pid_t pids[5];
+	char *cgroup[10] = {0};
+	int ret = KSFT_FAIL;
+	int i;
+	cgroup[0] = cg_name(root, "cg_test_tree_A");
+	if (!cgroup[0])
+		goto cleanup;
+	cgroup[1] = cg_name(cgroup[0], "B");
+	if (!cgroup[1])
+		goto cleanup;
+	cgroup[2] = cg_name(cgroup[1], "C");
+	if (!cgroup[2])
+		goto cleanup;
+	cgroup[3] = cg_name(cgroup[1], "D");
+	if (!cgroup[3])
+		goto cleanup;
+	cgroup[4] = cg_name(cgroup[0], "E");
+	if (!cgroup[4])
+		goto cleanup;
+	cgroup[5] = cg_name(cgroup[4], "F");
+	if (!cgroup[5])
+		goto cleanup;
+	cgroup[6] = cg_name(cgroup[5], "G");
+	if (!cgroup[6])
+		goto cleanup;
+	cgroup[7] = cg_name(cgroup[6], "H");
+	if (!cgroup[7])
+		goto cleanup;
+	cgroup[8] = cg_name(cgroup[0], "I");
+	if (!cgroup[8])
+		goto cleanup;
+	cgroup[9] = cg_name(cgroup[0], "K");
+	if (!cgroup[9])
+		goto cleanup;
+	for (i = 0; i < 10; i++)
+		if (cg_create(cgroup[i]))
+			goto cleanup;
+	pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL);
+	pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL);
+	pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL);
+	pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL);
+	pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL);
+	/*
+	 * Wait until all child processes will enter
+	 * corresponding cgroups.
+	 */
+	if (cg_wait_for_proc_count(cgroup[2], 1) ||
+	    cg_wait_for_proc_count(cgroup[7], 1) ||
+	    cg_wait_for_proc_count(cgroup[9], 3))
+		goto cleanup;
+	/*
+	 * Kill A and check that we get an empty notification.
+	 */
+	if (cg_kill_wait(cgroup[0]))
+		goto cleanup;
+	ret = KSFT_PASS;
+cleanup:
+	for (i = 0; i < 5; i++)
+		wait_for_pid(pids[i]);
+	if (ret == KSFT_PASS &&
+	    cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
+		ret = KSFT_FAIL;
+	for (i = 9; i >= 0 && cgroup[i]; i--) {
+		cg_destroy(cgroup[i]);
+		free(cgroup[i]);
+	}
+	return ret;
+}
+static int forkbomb_fn(const char *cgroup, void *arg)
+{
+	int ppid;
+	fork();
+	fork();
+	ppid = getppid();
+	while (getppid() == ppid)
+		usleep(1000);
+	return getppid() == ppid;
+}
+/*
+ * The test runs a fork bomb in a cgroup and tries to kill it.
+ */
+static int test_cgkill_forkbomb(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	pid_t pid = -ESRCH;
+	cgroup = cg_name(root, "cg_forkbomb_test");
+	if (!cgroup)
+		goto cleanup;
+	if (cg_create(cgroup))
+		goto cleanup;
+	pid = cg_run_nowait(cgroup, forkbomb_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+	usleep(100000);
+	if (cg_kill_wait(cgroup))
+		goto cleanup;
+	if (cg_wait_for_proc_count(cgroup, 0))
+		goto cleanup;
+	ret = KSFT_PASS;
+cleanup:
+	if (pid > 0)
+		wait_for_pid(pid);
+	if (ret == KSFT_PASS &&
+	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+		ret = KSFT_FAIL;
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+#define T(x) { x, #x }
+struct cgkill_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_cgkill_simple),
+	T(test_cgkill_tree),
+	T(test_cgkill_forkbomb),
+};
+#undef T
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+	return ret;
+}