diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index dc44785dc0fa..3f8216912df0 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -886,6 +886,15 @@ All cgroup core files are prefixed with "cgroup." A dying cgroup can consume system resources not exceeding limits, which were active at the moment of cgroup deletion. + cpu.usage_usec + CPU time consumed in the subtree. + + cpu.user_usec + User CPU time consumed in the subtree. + + cpu.system_usec + System CPU time consumed in the subtree. + Controllers =========== diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ade4a78a54c2..3e55bbd31ad1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -254,6 +255,57 @@ struct css_set { struct rcu_head rcu_head; }; +/* + * cgroup basic resource usage statistics. Accounting is done per-cpu in + * cgroup_cpu_stat which is then lazily propagated up the hierarchy on + * reads. + * + * When a stat gets updated, the cgroup_cpu_stat and its ancestors are + * linked into the updated tree. On the following read, propagation only + * considers and consumes the updated tree. This makes reading O(the + * number of descendants which have been active since last read) instead of + * O(the total number of descendants). + * + * This is important because there can be a lot of (draining) cgroups which + * aren't active and stat may be read frequently. The combination can + * become very expensive. By propagating selectively, increasing reading + * frequency decreases the cost of each read. + */ +struct cgroup_cpu_stat { + /* + * ->sync protects all the current counters. These are the only + * fields which get updated in the hot path. + */ + struct u64_stats_sync sync; + struct task_cputime cputime; + + /* + * Snapshots at the last reading. These are used to calculate the + * deltas to propagate to the global counters. + */ + struct task_cputime last_cputime; + + /* + * Child cgroups with stat updates on this cpu since the last read + * are linked on the parent's ->updated_children through + * ->updated_next. + * + * In addition to being more compact, singly-linked list pointing + * to the cgroup makes it unnecessary for each per-cpu struct to + * point back to the associated cgroup. + * + * Protected by per-cpu cgroup_cpu_stat_lock. + */ + struct cgroup *updated_children; /* terminated by self cgroup */ + struct cgroup *updated_next; /* NULL iff not on the list */ +}; + +struct cgroup_stat { + /* per-cpu statistics are collected into the folowing global counters */ + struct task_cputime cputime; + struct prev_cputime prev_cputime; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -353,6 +405,11 @@ struct cgroup { */ struct cgroup *dom_cgrp; + /* cgroup basic resource statistics */ + struct cgroup_cpu_stat __percpu *cpu_stat; + struct cgroup_stat pending_stat; /* pending from children */ + struct cgroup_stat stat; + /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6cd579329310..328a70ce0e23 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix); + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec); + static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { + struct cgroup *cgrp; + cpuacct_charge(task, delta_exec); + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + if (cgroup_parent(cgrp)) + __cgroup_account_cputime(cgrp, delta_exec); + rcu_read_unlock(); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { + struct cgroup *cgrp; + cpuacct_account_field(task, index, delta_exec); + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + if (cgroup_parent(cgrp)) + __cgroup_account_cputime_field(cgrp, index, delta_exec); + rcu_read_unlock(); } #else /* CONFIG_CGROUPS */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..0acee616e06c 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..fa642c99586a 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); +/* + * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_boot(void); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..d036625556c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); + cgroup_stat_show_cputime(seq, "cpu."); + return 0; } @@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; @@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5148,6 +5166,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..9cce79e89320 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,334 @@ +#include "cgroup-internal.h" + +#include + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "%susage_usec %llu\n" + "%suser_usec %llu\n" + "%ssystem_usec %llu\n", + prefix, usage, prefix, utime, prefix, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +}