Commit 22714a2b authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "Cgroup2 cpu controller support is finally merged.

   - Basic cpu statistics support to allow monitoring by default without
     the CPU controller enabled.

   - cgroup2 cpu controller support.

   - /sys/kernel/cgroup files to help dealing with new / optional
     features"

* 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: export list of cgroups v2 features using sysfs
  cgroup: export list of delegatable control files using sysfs
  cgroup: mark @cgrp __maybe_unused in cpu_stat_show()
  MAINTAINERS: relocate cpuset.c
  cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat
  sched: Implement interface for cgroup unified hierarchy
  sched: Misc preps for cgroup unified hierarchy interface
  sched/cputime: Add dummy cputime_adjust() implementation for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  cgroup: statically initialize init_css_set->dfl_cgrp
  cgroup: Implement cgroup2 basic CPU usage accounting
  cpuacct: Introduce cgroup_account_cputime[_field]()
  sched/cputime: Expose cputime_adjust()
parents 766ec76a 5f2e6734
......@@ -893,10 +893,6 @@ Controllers
CPU
---
.. note::
The interface for the cpu controller hasn't been merged yet
The "cpu" controller regulates distribution of CPU cycles. This
controller implements weight and absolute bandwidth limit models for
normal scheduling policy and absolute bandwidth allocation model for
......@@ -910,12 +906,16 @@ All time durations are in microseconds.
cpu.stat
A read-only flat-keyed file which exists on non-root cgroups.
This file exists whether the controller is enabled or not.
It reports the following six stats:
It always reports the following three stats:
- usage_usec
- user_usec
- system_usec
and the following three when the controller is enabled:
- nr_periods
- nr_throttled
- throttled_usec
......@@ -926,6 +926,18 @@ All time durations are in microseconds.
The weight in the range [1, 10000].
cpu.weight.nice
A read-write single value file which exists on non-root
cgroups. The default is "0".
The nice value is in the range [-20, 19].
This interface file is an alternative interface for
"cpu.weight" and allows reading and setting weight using the
same values used by nice(2). Because the range is smaller and
granularity is coarser for the nice values, the read value is
the closest approximation of the current weight.
cpu.max
A read-write two value file which exists on non-root cgroups.
The default is "max 100000".
......@@ -938,26 +950,6 @@ All time durations are in microseconds.
$PERIOD duration. "max" for $MAX indicates no limit. If only
one number is written, $MAX is updated.
cpu.rt.max
.. note::
The semantics of this file is still under discussion and the
interface hasn't been merged yet
A read-write two value file which exists on all cgroups.
The default is "0 100000".
The maximum realtime runtime allocation. Over-committing
configurations are disallowed and process migrations are
rejected if not enough bandwidth is available. It's in the
following format::
$MAX $PERIOD
which indicates that the group may consume up to $MAX in each
$PERIOD duration. If only one number is written, $MAX is
updated.
Memory
------
......
......@@ -3592,7 +3592,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
S: Maintained
F: Documentation/cgroup-v1/cpusets.txt
F: include/linux/cpuset.h
F: kernel/cpuset.c
F: kernel/cgroup/cpuset.c
CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
M: Johannes Weiner <hannes@cmpxchg.org>
......
......@@ -17,6 +17,7 @@
#include <linux/refcount.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
......@@ -255,6 +256,57 @@ struct css_set {
struct rcu_head rcu_head;
};
/*
* cgroup basic resource usage statistics. Accounting is done per-cpu in
* cgroup_cpu_stat which is then lazily propagated up the hierarchy on
* reads.
*
* When a stat gets updated, the cgroup_cpu_stat and its ancestors are
* linked into the updated tree. On the following read, propagation only
* considers and consumes the updated tree. This makes reading O(the
* number of descendants which have been active since last read) instead of
* O(the total number of descendants).
*
* This is important because there can be a lot of (draining) cgroups which
* aren't active and stat may be read frequently. The combination can
* become very expensive. By propagating selectively, increasing reading
* frequency decreases the cost of each read.
*/
/*
* Per-cpu CPU time accounting state of one cgroup. One instance exists
* for each possible cpu and is reached through cgroup->cpu_stat.
*/
struct cgroup_cpu_stat {
/*
* ->sync protects all the current counters. These are the only
* fields which get updated in the hot path.
*
* Writers bracket their updates with u64_stats_update_begin/end()
* and the flush path re-reads ->cputime until
* __u64_stats_fetch_retry() reports a stable snapshot.
*/
struct u64_stats_sync sync;
struct task_cputime cputime;
/*
* Snapshots at the last reading. These are used to calculate the
* deltas to propagate to the global counters.
*/
struct task_cputime last_cputime;
/*
* Child cgroups with stat updates on this cpu since the last read
* are linked on the parent's ->updated_children through
* ->updated_next.
*
* In addition to being more compact, singly-linked list pointing
* to the cgroup makes it unnecessary for each per-cpu struct to
* point back to the associated cgroup.
*
* Protected by per-cpu cgroup_cpu_stat_lock.
*/
struct cgroup *updated_children; /* terminated by self cgroup */
struct cgroup *updated_next; /* NULL iff not on the list */
};
/*
* Hierarchy-wide (not per-cpu) CPU time totals of a cgroup. Used both for
* cgroup->stat and for cgroup->pending_stat.
*/
struct cgroup_stat {
/* per-cpu statistics are collected into the following global counters */
struct task_cputime cputime;
/* state handed to cputime_adjust() when the totals are shown */
struct prev_cputime prev_cputime;
};
struct cgroup {
/* self css with NULL ->ss, points back to this cgroup */
struct cgroup_subsys_state self;
......@@ -354,6 +406,11 @@ struct cgroup {
*/
struct cgroup *dom_cgrp;
/* cgroup basic resource statistics */
struct cgroup_cpu_stat __percpu *cpu_stat;
struct cgroup_stat pending_stat; /* pending from children */
struct cgroup_stat stat;
/*
* list of pidlists, up to two for each namespace (one for procs, one
* for tasks); created on demand.
......@@ -513,6 +570,8 @@ struct cgroup_subsys {
void (*css_released)(struct cgroup_subsys_state *css);
void (*css_free)(struct cgroup_subsys_state *css);
void (*css_reset)(struct cgroup_subsys_state *css);
int (*css_extra_stat_show)(struct seq_file *seq,
struct cgroup_subsys_state *css);
int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
......
......@@ -23,6 +23,7 @@
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup-defs.h>
......@@ -689,6 +690,63 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
char *buf, size_t buflen) {}
#endif /* !CONFIG_CGROUPS */
/*
* Basic resource stats.
*/
#ifdef CONFIG_CGROUPS
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
u64 val) {}
#endif
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec);
/*
 * Charge @delta_exec of execution time consumed by @task to cpuacct and
 * to the task's cgroup on the default hierarchy.  Tasks sitting in the
 * root cgroup (which has no parent) are not accounted in the cgroup
 * basic stats.
 */
static inline void cgroup_account_cputime(struct task_struct *task,
					  u64 delta_exec)
{
	struct cgroup *dfl_cgrp;

	cpuacct_charge(task, delta_exec);

	rcu_read_lock();
	dfl_cgrp = task_dfl_cgroup(task);
	if (cgroup_parent(dfl_cgrp) != NULL)
		__cgroup_account_cputime(dfl_cgrp, delta_exec);
	rcu_read_unlock();
}
/*
 * Charge @delta_exec of cpu time of category @index consumed by @task to
 * cpuacct and to the task's cgroup on the default hierarchy.  Tasks in
 * the root cgroup (which has no parent) are skipped for the cgroup part.
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
						enum cpu_usage_stat index,
						u64 delta_exec)
{
	struct cgroup *dfl_cgrp;

	cpuacct_account_field(task, index, delta_exec);

	rcu_read_lock();
	dfl_cgrp = task_dfl_cgroup(task);
	if (cgroup_parent(dfl_cgrp) != NULL)
		__cgroup_account_cputime_field(dfl_cgrp, index, delta_exec);
	rcu_read_unlock();
}
#else /* CONFIG_CGROUPS */
static inline void cgroup_account_cputime(struct task_struct *task,
u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
enum cpu_usage_stat index,
u64 delta_exec) {}
#endif /* CONFIG_CGROUPS */
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
......
......@@ -54,7 +54,8 @@ static inline void task_cputime_scaled(struct task_struct *t,
extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st);
/*
* Thread group CPU time accounting.
......
# SPDX-License-Identifier: GPL-2.0
obj-y := cgroup.o namespace.o cgroup-v1.o
obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
......
......@@ -200,6 +200,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
* stat.c
*/
void cgroup_stat_flush(struct cgroup *cgrp);
int cgroup_stat_init(struct cgroup *cgrp);
void cgroup_stat_exit(struct cgroup *cgrp);
void cgroup_stat_show_cputime(struct seq_file *seq);
void cgroup_stat_boot(void);
/*
* namespace.c
*/
......
......@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
struct cgroup_root cgrp_dfl_root;
struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
......@@ -461,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
return &cgrp->self;
}
/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Look up @cgrp's css associated with @ss and take a reference on it.
 * Returns %NULL if there is no such css or if it is not online; otherwise
 * the caller owns a reference and must drop it with css_put().
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *found;

	rcu_read_lock();

	found = cgroup_css(cgrp, ss);
	/* a css that exists but can't be pinned is reported as absent */
	if (found && !css_tryget_online(found))
		found = NULL;

	rcu_read_unlock();

	return found;
}
/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
......@@ -647,6 +671,14 @@ struct css_set init_css_set = {
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
* The following field is re-initialized when this cset gets linked
* in cgroup_init(). However, let's initialize the field
* statically too so that the default cgroup can be accessed safely
* early during boot.
*/
.dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
......@@ -3315,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
/*
 * Let subsystem @ssid append its own lines to a stat file of @cgrp.
 *
 * Returns 0 when the subsystem has no ->css_extra_stat_show() callback or
 * when @cgrp has no online css for it; otherwise returns whatever the
 * callback returned.
 */
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
						 struct cgroup *cgrp, int ssid)
{
	struct cgroup_subsys *subsys = cgroup_subsys[ssid];
	struct cgroup_subsys_state *css;
	int err = 0;

	if (subsys->css_extra_stat_show) {
		css = cgroup_tryget_css(cgrp, subsys);
		if (css) {
			err = subsys->css_extra_stat_show(seq, css);
			/* drop the reference taken by cgroup_tryget_css() */
			css_put(css);
		}
	}

	return err;
}
/*
 * seq_show handler of "cpu.stat".  Always prints the basic cputime lines
 * and, when CONFIG_CGROUP_SCHED is set, lets the cpu controller append
 * its extra stats.
 */
static int cpu_stat_show(struct seq_file *seq, void *v)
{
	/* only referenced when CONFIG_CGROUP_SCHED is enabled */
	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;

	cgroup_stat_show_cputime(seq);
#ifdef CONFIG_CGROUP_SCHED
	return cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#else
	return 0;
#endif
}
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
......@@ -4422,6 +4485,11 @@ static struct cftype cgroup_base_files[] = {
.name = "cgroup.stat",
.seq_show = cgroup_stat_show,
},
{
.name = "cpu.stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
{ } /* terminate */
};
......@@ -4482,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
if (cgroup_on_dfl(cgrp))
cgroup_stat_exit(cgrp);
kfree(cgrp);
} else {
/*
......@@ -4526,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
trace_cgroup_release(cgrp);
if (cgroup_on_dfl(cgrp))
cgroup_stat_flush(cgrp);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
......@@ -4709,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_free_cgrp;
if (cgroup_on_dfl(parent)) {
ret = cgroup_stat_init(cgrp);
if (ret)
goto out_cancel_ref;
}
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
......@@ -4716,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_cancel_ref;
goto out_stat_exit;
}
init_cgroup_housekeeping(cgrp);
......@@ -4767,6 +4846,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
if (cgroup_on_dfl(parent))
cgroup_stat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
......@@ -5161,6 +5243,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_stat_boot();
/*
* The latency of the synchronize_sched() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
......@@ -5780,3 +5864,72 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
#ifdef CONFIG_SYSFS
/*
 * Append the names of all CFTYPE_NS_DELEGATABLE entries of @files to @buf,
 * one per line, each optionally preceded by "@prefix.".
 *
 * @files:  cftype array terminated by an entry with an empty name; may be
 *          NULL, in which case nothing is written
 * @buf:    destination buffer
 * @size:   number of bytes available at @buf
 * @prefix: controller name to prepend, or NULL for none
 *
 * Returns the number of bytes actually stored in @buf, not counting the
 * terminating NUL.  The return value is always smaller than @size (or 0
 * when @size is not positive), so callers can safely use it as an offset
 * into the same buffer.  Truncation triggers a WARN.
 */
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
				      ssize_t size, const char *prefix)
{
	struct cftype *cft;
	ssize_t ret = 0;

	/* no room for even the terminating NUL */
	if (size <= 0)
		return 0;

	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

		/* never hand snprintf() a zero or negative remaining size */
		if (ret < size)
			ret += snprintf(buf + ret, size - ret, "%s\n",
					cft->name);

		if (unlikely(ret >= size)) {
			WARN_ON(1);
			/*
			 * snprintf() returns the untruncated length; report
			 * only what was really stored so that the caller
			 * neither walks past @buf nor over-reports.
			 */
			ret = size - 1;
			break;
		}
	}

	return ret;
}
static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct cgroup_subsys *ss;
int ssid;
ssize_t ret = 0;
ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
PAGE_SIZE - ret,
cgroup_subsys_name[ssid]);
return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
/*
* sysfs show handler of the "features" attribute. Writes the fixed list of
* feature names (currently only "nsdelegate") into @buf and returns the
* value of snprintf().
*/
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
/* NULL-terminated list of the attributes exposed in the "cgroup" sysfs group */
static struct attribute *cgroup_sysfs_attrs[] = {
&cgroup_delegate_attr.attr,
&cgroup_features_attr.attr,
NULL,
};
/* attribute group named "cgroup"; registered under kernel_kobj by cgroup_sysfs_init() */
static const struct attribute_group cgroup_sysfs_attr_group = {
.attrs = cgroup_sysfs_attrs,
.name = "cgroup",
};
/*
* Register the "cgroup" attribute group on kernel_kobj. Returns the result
* of sysfs_create_group().
*/
static int __init cgroup_sysfs_init(void)
{
return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);
#endif /* CONFIG_SYSFS */
#include "cgroup-internal.h"
#include <linux/sched/cputime.h>
static DEFINE_MUTEX(cgroup_stat_mutex);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
{
return per_cpu_ptr(cgrp->cpu_stat, cpu);
}
/**
 * cgroup_cpu_stat_updated - keep track of updated cpu_stat
 * @cgrp: target cgroup
 * @cpu: cpu on which cpu_stat was updated
 *
 * Called after @cgrp's cpu_stat on @cpu changed.  Links @cgrp, and every
 * ancestor not yet linked, onto its parent's ->updated_children list for
 * @cpu so that the next flush finds it.  See the comment on top of the
 * cgroup_cpu_stat definition for details.
 */
static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
	struct cgroup *parent;
	unsigned long flags;

	/*
	 * Lockless fast path: a non-NULL ->updated_next means @cgrp is
	 * already linked (lists are terminated by the parent itself, never
	 * by NULL).  The test may race, which only causes temporary
	 * inaccuracies and is acceptable.
	 */
	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* climb towards the root, linking each level into its parent */
	parent = cgroup_parent(cgrp);
	while (parent) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);

		/*
		 * Linking and unlinking both proceed bottom-up, so once a
		 * linked cgroup is met all of its ancestors are linked too.
		 */
		if (cstat->updated_next)
			break;

		cstat->updated_next = pcstat->updated_children;
		pcstat->updated_children = cgrp;

		cgrp = parent;
		parent = cgroup_parent(cgrp);
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
/**
* cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
* @pos: current position
* @root: root of the tree to traverse
* @cpu: target cpu
*
* Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
* the traversal and %NULL return indicates the end. During traversal,
* each returned cgroup is unlinked from the tree. Must be called with the
* matching cgroup_cpu_stat_lock held.
*
* The only ordering guarantee is that, for a parent and a child pair
* covered by a given traversal, if a child is visited, its parent is
* guaranteed to be visited afterwards.
*/
static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_cpu_stat *cstat;
struct cgroup *parent;
/* @root is handed out last, so seeing it again as @pos ends the walk */
if (pos == root)
return NULL;
/*
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
if (!pos)
pos = root;
else
pos = cgroup_parent(pos);
/* walk down to the first leaf */
while (true) {
cstat = cgroup_cpu_stat(pos, cpu);
/* a list holding only its own terminator means no updated children */
if (cstat->updated_children == pos)
break;
pos = cstat->updated_children;
}
/*
* Unlink @pos from the tree. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
parent = cgroup_parent(pos);
/* nothing to unlink when @pos has no parent or isn't on its list */
if (parent && cstat->updated_next) {
struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
struct cgroup_cpu_stat *ncstat;
struct cgroup **nextp;
nextp = &pcstat->updated_children;
while (true) {
ncstat = cgroup_cpu_stat(*nextp, cpu);
if (*nextp == pos)
break;
/* hitting the terminator means @pos was not on the list */
WARN_ON_ONCE(*nextp == parent);
nextp = &ncstat->updated_next;
}
*nextp = cstat->updated_next;
/* NULL marks @pos as off-list again */
cstat->updated_next = NULL;
}
return pos;
}
static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
struct cgroup_stat *src_stat)
{
dst_stat->cputime.utime += src_stat->cputime.utime;
dst_stat->cputime.stime += src_stat->cputime.stime;
dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
}
/*
* Fold @cgrp's per-cpu counters of @cpu, plus whatever its children left
* in @cgrp->pending_stat, into @cgrp->stat and forward the same amount to
* the parent's pending_stat. Caller must hold cgroup_stat_mutex.
*/
static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
struct task_cputime *last_cputime = &cstat->last_cputime;
struct task_cputime cputime;
/* only delta.cputime is ever set and consumed below */
struct cgroup_stat delta;
unsigned seq;
lockdep_assert_held(&cgroup_stat_mutex);
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&cstat->sync);
cputime = cstat->cputime;
} while (__u64_stats_fetch_retry(&cstat->sync, seq));
/* accumulate the deltas to propagate */
delta.cputime.utime = cputime.utime - last_cputime->utime;
delta.cputime.stime = cputime.stime - last_cputime->stime;
delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
last_cputime->sum_exec_runtime;
/* remember the snapshot so the next flush only sees new time */
*last_cputime = cputime;
/* transfer the pending stat into delta */
cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
/* propagate delta into the global stat and the parent's pending */
cgroup_stat_accumulate(&cgrp->stat, &delta);
if (parent)
cgroup_stat_accumulate(&parent->pending_stat, &delta);
}
/*
 * Worker of cgroup_stat_flush(); the caller must already hold
 * cgroup_stat_mutex.  For every possible cpu, drains the updated tree
 * rooted at @cgrp and flushes each cgroup popped from it.
 */
static void cgroup_stat_flush_locked(struct cgroup *cgrp)
{
	int cpu;

	lockdep_assert_held(&cgroup_stat_mutex);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
		struct cgroup *pos;

		raw_spin_lock_irq(lock);
		/* a NULL position starts the walk, a NULL result ends it */
		for (pos = cgroup_cpu_stat_pop_updated(NULL, cgrp, cpu); pos;
		     pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))
			cgroup_cpu_stat_flush_one(pos, cpu);
		raw_spin_unlock_irq(lock);
	}
}
/**
* cgroup_stat_flush - flush stats in @cgrp's subtree
* @cgrp: target cgroup
*
* Collect all per-cpu stats in @cgrp's subtree into the global counters
* and propagate them upwards. After this function returns, all cgroups in
* the subtree have up-to-date ->stat.
*
* This also gets all cgroups in the subtree including @cgrp off the
* ->updated_children lists.
*
* Takes cgroup_stat_mutex for the duration of the flush.
*/
void cgroup_stat_flush(struct cgroup *cgrp)
{
mutex_lock(&cgroup_stat_mutex);
cgroup_stat_flush_locked(cgrp);
mutex_unlock(&cgroup_stat_mutex);
}
/*
 * Open an update section on @cgrp's cpu_stat of the current cpu and
 * return it.  Every call must be paired with
 * cgroup_cpu_stat_account_end(), which closes the section and releases
 * the per-cpu pointer obtained here.
 */
static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
{
	struct cgroup_cpu_stat *this_stat = get_cpu_ptr(cgrp->cpu_stat);

	u64_stats_update_begin(&this_stat->sync);

	return this_stat;
}
/*
* Counterpart of cgroup_cpu_stat_account_begin(): closes the update
* section on @cstat, records @cgrp as updated on the current cpu and
* releases the per-cpu pointer.
*/
static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
struct cgroup_cpu_stat *cstat)
{
u64_stats_update_end(&cstat->sync);
/* done before put_cpu_ptr() so the cpu id matches @cstat */
cgroup_cpu_stat_updated(cgrp, smp_processor_id());
put_cpu_ptr(cstat);
}
/* Add @delta_exec to the sum_exec_runtime of @cgrp on the current cpu. */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat_account_begin(cgrp);

	cstat->cputime.sum_exec_runtime += delta_exec;

	cgroup_cpu_stat_account_end(cgrp, cstat);
}
/*
 * Add @delta_exec to @cgrp's user or system time on the current cpu,
 * depending on @index: USER and NICE count as user time; SYSTEM, IRQ and
 * SOFTIRQ count as system time.  Any other index changes no counter, but
 * @cgrp is still marked as updated.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat_account_begin(cgrp);

	if (index == CPUTIME_USER || index == CPUTIME_NICE)
		cstat->cputime.utime += delta_exec;
	else if (index == CPUTIME_SYSTEM || index == CPUTIME_IRQ ||
		 index == CPUTIME_SOFTIRQ)
		cstat->cputime.stime += delta_exec;

	cgroup_cpu_stat_account_end(cgrp, cstat);
}
void cgroup_stat_show_cputime(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
if (!cgroup_parent(cgrp))
return;
mutex_lock(&cgroup_stat_mutex);
cgroup_stat_flush_locked(cgrp);
usage = cgrp->stat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
&utime, &stime);
mutex_unlock(&cgroup_stat_mutex);
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
}
int cgroup_stat_init(struct cgroup *cgrp)
{
int cpu;
/* the root cgrp has cpu_stat preallocated */
if (!cgrp->cpu_stat) {
cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
if (!cgrp->cpu_stat)
return -ENOMEM;
}
/* ->updated_children list is self terminated */
for_each_possible_cpu(cpu)
cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
prev_cputime_init(&cgrp->stat.prev_cputime);
return 0;
}
/*
* Tear down the basic stat state of @cgrp: flush whatever is outstanding
* and free the per-cpu area.
*
* If, after the flush, any per-cpu entry still has children linked or is
* itself still linked, a one-time warning is emitted and the function
* returns without freeing ->cpu_stat.
*/
void cgroup_stat_exit(struct cgroup *cgrp)
{
int cpu;
cgroup_stat_flush(cgrp);
/* sanity check */
for_each_possible_cpu(cpu) {
struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
WARN_ON_ONCE(cstat->updated_next))
return;
}
free_percpu(cgrp->cpu_stat);
cgrp->cpu_stat = NULL;
}
/*
* Boot-time setup: initialize the per-cpu cgroup_cpu_stat_lock of every
* possible cpu, then initialize the stat state of the default hierarchy's
* root cgroup. Failure of the latter is fatal (BUG_ON).
*/
void __init cgroup_stat_boot(void)
{
int cpu;
for_each_possible_cpu(cpu)
raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
}
......@@ -6620,7 +6620,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
static int cpu_stats_show(struct seq_file *sf, void *v)
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
......@@ -6660,7 +6660,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
......@@ -6681,7 +6681,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
.seq_show = cpu_stats_show,
.seq_show = cpu_cfs_stat_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
......@@ -6699,16 +6699,182 @@ static struct cftype cpu_files[] = {
{ } /* Terminate */
};
/*
 * ->css_extra_stat_show() callback of the cpu controller.  With
 * CONFIG_CFS_BANDWIDTH it prints nr_periods, nr_throttled and
 * throttled_usec of the task group behind @css; without it nothing is
 * printed.  Always returns 0.
 */
static int cpu_extra_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	struct cfs_bandwidth *bw = &css_tg(css)->cfs_bandwidth;
	u64 throttled_usec = bw->throttled_time;

	/* scale throttled_time down by NSEC_PER_USEC for the usec field */
	do_div(throttled_usec, NSEC_PER_USEC);

	seq_printf(sf, "nr_periods %d\n"
		   "nr_throttled %d\n"
		   "throttled_usec %llu\n",
		   bw->nr_periods, bw->nr_throttled,
		   throttled_usec);
#endif
	return 0;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct task_group *tg = css_tg(css);
u64 weight = scale_load_down(tg->shares);
return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
}
/*
 * Write handler of "cpu.weight".
 *
 * cgroup weight knobs share common MIN, DFL and MAX values (1, 100 and
 * 10000 respectively).  The range loses a little on both ends compared
 * to scheduler shares, but it maps well onto them and round-trip
 * conversions preserve the original value over the whole range.
 *
 * Returns -ERANGE for a weight outside [CGROUP_WEIGHT_MIN,
 * CGROUP_WEIGHT_MAX], otherwise the result of sched_group_set_shares().
 */
static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 weight)
{
	u64 shares;

	if (!(weight >= CGROUP_WEIGHT_MIN && weight <= CGROUP_WEIGHT_MAX))
		return -ERANGE;

	/* inverse of the conversion done on read */
	shares = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(shares));
}
/*
* Read handler of "cpu.weight.nice": reports the nice value whose
* sched_prio_to_weight[] entry is closest to the group's current shares.
*/
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
unsigned long weight = scale_load_down(css_tg(css)->shares);
int last_delta = INT_MAX;
int prio, delta;
/* find the closest nice value to the current weight */
/*
* The scan stops at the first entry whose distance to @weight is no
* longer shrinking, so the best match is the entry before it.
* NOTE(review): this relies on sched_prio_to_weight[] being
* monotonic - confirm against the table definition.
*/
for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
delta = abs(sched_prio_to_weight[prio] - weight);
if (delta >= last_delta)
break;
last_delta = delta;
}
/* prio - 1 is the index of the closest entry found above */
return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}
/*
 * Write handler of "cpu.weight.nice": sets the group's shares to the
 * sched_prio_to_weight[] entry matching @nice.
 *
 * Returns -ERANGE for a value outside [MIN_NICE, MAX_NICE], otherwise the
 * result of sched_group_set_shares().
 */
static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft, s64 nice)
{
	unsigned long load;

	if (!(nice >= MIN_NICE && nice <= MAX_NICE))
		return -ERANGE;

	load = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];

	return sched_group_set_shares(css_tg(css), scale_load(load));
}
#endif
/*
 * Emit "$QUOTA $PERIOD\n" to @sf, with "max" standing in for a negative
 * (i.e. unlimited) @quota.
 */
static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota >= 0)
		seq_printf(sf, "%ld", quota);
	else
		seq_puts(sf, "max");

	seq_printf(sf, " %ld\n", period);
}
/*
 * Parse a "$MAX [$PERIOD]" string as written to cpu.max.
 *
 * @buf:     NUL-terminated user input
 * @periodp: in/out; the caller stores the current period (in usecs) here
 *           before calling, so that it is kept when @buf carries only
 *           one token.  On success it holds the period in nsecs.
 * @quotap:  out; the quota in nsecs, or RUNTIME_INF for "max"
 *
 * Returns 0 on success, -EINVAL if @buf has no token or if the first
 * token is neither a number nor "max".
 */
static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];	/* U64_MAX */

	/*
	 * Bound the token to the buffer: an unbounded "%s" lets an
	 * over-long first word overrun tok[] on the stack.
	 */
	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap) == 1)
		*quotap *= NSEC_PER_USEC;
	else if (!strcmp(tok, "max"))
		*quotap = RUNTIME_INF;
	else
		return -EINVAL;

	return 0;
}
#ifdef CONFIG_CFS_BANDWIDTH
/* seq_show handler of "cpu.max": prints the group's CFS quota and period. */
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);
	struct task_group *tg = css_tg(css);

	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));

	return 0;
}
/*
 * Write handler of "cpu.max": parses "$MAX [$PERIOD]" from @buf and
 * applies it as the group's CFS bandwidth.  The current period is used
 * when @buf does not provide one.
 *
 * Returns @nbytes on success or the negative error from parsing or from
 * tg_set_cfs_bandwidth().
 */
static ssize_t cpu_max_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct task_group *tg = css_tg(of_css(of));
	u64 period = tg_get_cfs_period(tg);
	u64 quota;
	int err;

	err = cpu_period_quota_parse(buf, &period, &quota);
	if (err)
		return err;

	err = tg_set_cfs_bandwidth(tg, period, quota);
	if (err)
		return err;

	return nbytes;
}
#endif
/*
* Interface files of the cpu controller on the default hierarchy (wired up
* through cpu_cgrp_subsys.dfl_cftypes). None of them exists on the root
* cgroup.
*/
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* "weight" and "weight.nice" are two views of the same group shares */
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_weight_read_u64,
.write_u64 = cpu_weight_write_u64,
},
{
.name = "weight.nice",
.flags = CFTYPE_NOT_ON_ROOT,
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
/* "max" carries the CFS quota and period as "$MAX $PERIOD" */
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
#endif
{ } /* terminate */
};
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_files,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
.early_init = true,
.threaded = true,
};
#endif /* CONFIG_CGROUP_SCHED */
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_CGROUP_CPUACCT
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
}
static inline void
cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
}
#endif
......@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
cpuacct_account_field(p, index, tmp);
cgroup_account_cputime_field(p, index, tmp);
}
/*
......@@ -446,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
/*
* Dummy cputime_adjust(): hands back the raw utime and stime of @curr
* unchanged; @prev is not used.
*/
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
{
*ut = curr->utime;
*st = curr->stime;
}
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
......@@ -584,8 +591,7 @@ static u64 scale_stime(u64 stime, u64 rtime, u64 total)
*
* Assuming that rtime_i+1 >= rtime_i.
*/
static void cputime_adjust(struct task_cputime *curr,
struct prev_cputime *prev,
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
{
u64 rtime, stime, utime;
......
......@@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
......
......@@ -844,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
......
......@@ -969,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
......
......@@ -30,6 +30,7 @@
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
......@@ -37,7 +38,6 @@
#include "cpupri.h"
#include "cpudeadline.h"
#include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
......
......@@ -72,7 +72,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment