From 61e867fde21ea94f2166899f24f16cad85cc7b24 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 29 Sep 2019 16:06:58 +0800 Subject: [PATCH 01/24] cgroup: short-circuit current_cgns_cgroup_from_root() on the default hierarchy Like commit 13d82fb77abb ("cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy"), short-circuit current_cgns_cgroup_from_root() on the default hierarchy. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 080561bb8a4b..f6cba23290a1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1374,6 +1374,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; From e7c7b1d85dc1646c874096dac3cf01537c1fd6f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:39 +0200 Subject: [PATCH 02/24] cgroup: Update comments about task exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We no longer take cgroup_mutex in cgroup_exit and the exiting tasks are not moved to init_css_set, reflect that in several comments to prevent confusion. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f6cba23290a1..01fc24aeac71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1432,9 +1431,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * No need to lock the task - since we hold css_set_lock the + * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } @@ -6030,7 +6028,7 @@ void cgroup_post_fork(struct task_struct *child) struct css_set *cset; spin_lock_irq(&css_set_lock); - cset = task_css_set(current); + cset = task_css_set(current); /* current is @child's parent */ if (list_empty(&child->cg_list)) { get_css_set(cset); cset->nr_tasks++; @@ -6073,20 +6071,8 @@ void cgroup_post_fork(struct task_struct *child) * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * - * Description: Detach cgroup from @tsk and release it. + * Description: Detach cgroup from @tsk. * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. - * - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We - * call cgroup_exit() while the task is still competent to handle - * notify_on_release(), then leave the task attached to the root cgroup in - * each hierarchy for the remainder of its exit. No need to bother with - * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk) { @@ -6096,7 +6082,8 @@ void cgroup_exit(struct task_struct *tsk) /* * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. + * with us (thanks to cgroup_threadgroup_rwsem), we can check css_set + * and cg_list without synchronization. */ cset = task_css_set(tsk); @@ -6112,6 +6099,8 @@ void cgroup_exit(struct task_struct *tsk) spin_unlock_irq(&css_set_lock); } else { + /* Take reference to avoid freeing init_css_set in cgroup_free, + * see cgroup_fork(). */ get_css_set(cset); } From 9a3284fad42f66bb43629c6716709ff791aaa457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:40 +0200 Subject: [PATCH 03/24] cgroup: Optimize single thread migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are reports of users who use thread migrations between cgroups and they report performance drop after d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"). The effect is pronounced on machines with more CPUs. The migration is affected by forking noise happening in the background, after the mentioned commit a migrating thread must wait for all (forking) processes on the system, not only of its threadgroup. There are several places that need to synchronize with migration: a) do_exit, b) de_thread, c) copy_process, d) cgroup_update_dfl_csses, e) parallel migration (cgroup_{proc,thread}s_write). In the case of self-migrating thread, we relax the synchronization on cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are excluded with cgroup_mutex, c) does not matter in case of single thread migration and the executing thread cannot exec(2) or exit(2) while it is writing into cgroup.threads. In case of do_exit because of signal delivery, we either exit before the migration or finish the migration (of not yet PF_EXITING thread) and die afterwards. This patch handles only the case of self-migration by writing "0" into cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem with numeric PIDs. This change improves migration dependent workload performance similar to per-signal_struct state. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 5 +++-- kernel/cgroup/cgroup-v1.c | 5 +++-- kernel/cgroup/cgroup.c | 39 +++++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 809e34a3c017..90d1710fef6c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7f83f4121d8d..09f3a413f6f8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; + bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 01fc24aeac71..8b1c4fd47a7a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2824,7 +2824,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; @@ -2833,7 +2834,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write + * callers by cgroup_mutex. + * Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + if (pid || threadgroup) { + percpu_down_write(&cgroup_threadgroup_rwsem); + *locked = true; + } else { + *locked = false; + } rcu_read_lock(); if (pid) { @@ -2864,13 +2879,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + if (*locked) { + percpu_up_write(&cgroup_threadgroup_rwsem); + *locked = false; + } out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; @@ -2879,7 +2897,8 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (locked) + percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -4754,12 +4773,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true); + task = cgroup_procs_write_start(buf, true, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4777,7 +4797,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -4795,6 +4815,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; buf = strstrip(buf); @@ -4802,7 +4823,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, false); + task = cgroup_procs_write_start(buf, false, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4826,7 +4847,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); From 58c9f75b86f76895b9dd44f21dc1e37d0f477cc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:41 +0200 Subject: [PATCH 04/24] selftests: cgroup: Simplify task self migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify task migration by being oblivious about its PID during migration. This allows to easily migrate individual threads as well. This change brings no functional change and prepares grounds for thread granularity migrating tests. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/cgroup_util.c | 16 +++++++++++----- tools/testing/selftests/cgroup/cgroup_util.h | 4 +++- tools/testing/selftests/cgroup/test_freezer.c | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index bdb69599c4bd..f6573eac1365 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -282,10 +282,12 @@ int cg_enter(const char *cgroup, int pid) int cg_enter_current(const char *cgroup) { - char pidbuf[64]; + return cg_write(cgroup, "cgroup.procs", "0"); +} - snprintf(pidbuf, sizeof(pidbuf), "%d", getpid()); - return cg_write(cgroup, "cgroup.procs", pidbuf); +int cg_enter_current_thread(const char *cgroup) +{ + return cg_write(cgroup, "cgroup.threads", "0"); } int cg_run(const char *cgroup, @@ -410,11 +412,15 @@ int set_oom_adj_score(int pid, int score) return 0; } -char proc_read_text(int pid, const char *item, char *buf, size_t size) +ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; - snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); + if (!pid) + snprintf(path, sizeof(path), "/proc/%s/%s", + thread ? "thread-self" : "self", item); + else + snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); return read_text(path, buf, size); } diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index c72f28046bfa..27ff21d82af1 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #include #define PAGE_SIZE 4096 @@ -35,6 +36,7 @@ extern int cg_run(const char *cgroup, void *arg); extern int cg_enter(const char *cgroup, int pid); extern int cg_enter_current(const char *cgroup); +extern int cg_enter_current_thread(const char *cgroup); extern int cg_run_nowait(const char *cgroup, int (*fn)(const char *cgroup, void *arg), void *arg); @@ -45,4 +47,4 @@ extern int is_swap_enabled(void); extern int set_oom_adj_score(int pid, int score); extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); -extern char proc_read_text(int pid, const char *item, char *buf, size_t size); +extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 0fc1b6d4b0f9..aadc52a1a1b1 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -701,7 +701,7 @@ static int proc_check_stopped(int pid) char buf[PAGE_SIZE]; int len; - len = proc_read_text(pid, "stat", buf, sizeof(buf)); + len = proc_read_text(pid, 0, "stat", buf, sizeof(buf)); if (len == -1) { debug("Can't get %d stat\n", pid); return -1; From 11318989c381d2b08b4e361a03df09d54434b26c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:42 +0200 Subject: [PATCH 05/24] selftests: cgroup: Add task migration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two new tests that verify that thread and threadgroup migrations work as expected. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/Makefile | 2 +- tools/testing/selftests/cgroup/cgroup_util.c | 26 ++++ tools/testing/selftests/cgroup/cgroup_util.h | 2 + tools/testing/selftests/cgroup/test_core.c | 146 +++++++++++++++++++ 4 files changed, 175 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 8d369b6a2069..1c9179400be0 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -Wall +CFLAGS += -Wall -pthread all: diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index f6573eac1365..8f7131dcf1ff 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -158,6 +158,22 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) return atol(ptr + strlen(key)); } +long cg_read_lc(const char *cgroup, const char *control) +{ + char buf[PAGE_SIZE]; + const char delim[] = "\n"; + char *line; + long cnt = 0; + + if (cg_read(cgroup, control, buf, sizeof(buf))) + return -1; + + for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) + cnt++; + + return cnt; +} + int cg_write(const char *cgroup, const char *control, char *buf) { char path[PATH_MAX]; @@ -424,3 +440,13 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t return read_text(path, buf, size); } + +int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) +{ + char buf[PAGE_SIZE]; + + if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) + return -1; + + return strstr(buf, needle) ? 0 : -1; +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 27ff21d82af1..49c54fbdb229 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -30,6 +30,7 @@ extern int cg_read_strstr(const char *cgroup, const char *control, const char *needle); extern long cg_read_long(const char *cgroup, const char *control); long cg_read_key_long(const char *cgroup, const char *control, const char *key); +extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); extern int cg_run(const char *cgroup, int (*fn)(const char *cgroup, void *arg), @@ -48,3 +49,4 @@ extern int set_oom_adj_score(int pid, int score); extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); +extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 79053a4f4783..c5ca669feb2b 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include #include "../kselftest.h" #include "cgroup_util.h" @@ -354,6 +357,147 @@ cleanup: return ret; } +static void *dummy_thread_fn(void *arg) +{ + return (void *)(size_t)pause(); +} + +/* + * Test threadgroup migration. + * All threads of a process are migrated together. + */ +static int test_cgcore_proc_migration(const char *root) +{ + int ret = KSFT_FAIL; + int t, c_threads, n_threads = 13; + char *src = NULL, *dst = NULL; + pthread_t threads[n_threads]; + + src = cg_name(root, "cg_src"); + dst = cg_name(root, "cg_dst"); + if (!src || !dst) + goto cleanup; + + if (cg_create(src)) + goto cleanup; + if (cg_create(dst)) + goto cleanup; + + if (cg_enter_current(src)) + goto cleanup; + + for (c_threads = 0; c_threads < n_threads; ++c_threads) { + if (pthread_create(&threads[c_threads], NULL, dummy_thread_fn, NULL)) + goto cleanup; + } + + cg_enter_current(dst); + if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + for (t = 0; t < c_threads; ++t) { + pthread_cancel(threads[t]); + } + + for (t = 0; t < c_threads; ++t) { + pthread_join(threads[t], NULL); + } + + cg_enter_current(root); + + if (dst) + cg_destroy(dst); + if (src) + cg_destroy(src); + free(dst); + free(src); + return ret; +} + +static void *migrating_thread_fn(void *arg) +{ + int g, i, n_iterations = 1000; + char **grps = arg; + char lines[3][PATH_MAX]; + + for (g = 1; g < 3; ++g) + snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0])); + + for (i = 0; i < n_iterations; ++i) { + cg_enter_current_thread(grps[(i % 2) + 1]); + + if (proc_read_strstr(0, 1, "cgroup", lines[(i % 2) + 1])) + return (void *)-1; + } + return NULL; +} + +/* + * Test single thread migration. + * Threaded cgroups allow successful migration of a thread. + */ +static int test_cgcore_thread_migration(const char *root) +{ + int ret = KSFT_FAIL; + char *dom = NULL; + char line[PATH_MAX]; + char *grps[3] = { (char *)root, NULL, NULL }; + pthread_t thr; + void *retval; + + dom = cg_name(root, "cg_dom"); + grps[1] = cg_name(root, "cg_dom/cg_src"); + grps[2] = cg_name(root, "cg_dom/cg_dst"); + if (!grps[1] || !grps[2] || !dom) + goto cleanup; + + if (cg_create(dom)) + goto cleanup; + if (cg_create(grps[1])) + goto cleanup; + if (cg_create(grps[2])) + goto cleanup; + + if (cg_write(grps[1], "cgroup.type", "threaded")) + goto cleanup; + if (cg_write(grps[2], "cgroup.type", "threaded")) + goto cleanup; + + if (cg_enter_current(grps[1])) + goto cleanup; + + if (pthread_create(&thr, NULL, migrating_thread_fn, grps)) + goto cleanup; + + if (pthread_join(thr, &retval)) + goto cleanup; + + if (retval) + goto cleanup; + + snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0])); + if (proc_read_strstr(0, 1, "cgroup", line)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (grps[2]) + cg_destroy(grps[2]); + if (grps[1]) + cg_destroy(grps[1]); + if (dom) + cg_destroy(dom); + free(grps[2]); + free(grps[1]); + free(dom); + return ret; +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -366,6 +510,8 @@ struct corecg_test { T(test_cgcore_parent_becomes_threaded), T(test_cgcore_invalid_domain), T(test_cgcore_populated), + T(test_cgcore_proc_migration), + T(test_cgcore_thread_migration), }; #undef T From 1a99fcc035fb666f5d0909aef0a753284cb5d04c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:43 +0200 Subject: [PATCH 06/24] selftests: cgroup: Run test_core under interfering stress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_core tests various cgroup creation/removal and task migration paths. Run the tests repeatedly with interfering noise (for lockdep checks). Currently, forking noise and subsystem enabled/disabled switching are the implemented noises. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/Makefile | 2 + tools/testing/selftests/cgroup/test_stress.sh | 4 + tools/testing/selftests/cgroup/with_stress.sh | 101 ++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100755 tools/testing/selftests/cgroup/test_stress.sh create mode 100755 tools/testing/selftests/cgroup/with_stress.sh diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 1c9179400be0..66aafe1f5746 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -3,6 +3,8 @@ CFLAGS += -Wall -pthread all: +TEST_FILES := with_stress.sh +TEST_PROGS := test_stress.sh TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_freezer diff --git a/tools/testing/selftests/cgroup/test_stress.sh b/tools/testing/selftests/cgroup/test_stress.sh new file mode 100755 index 000000000000..15d9d5896394 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_stress.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +./with_stress.sh -s subsys -s fork ./test_core diff --git a/tools/testing/selftests/cgroup/with_stress.sh b/tools/testing/selftests/cgroup/with_stress.sh new file mode 100755 index 000000000000..e28c35008f5b --- /dev/null +++ b/tools/testing/selftests/cgroup/with_stress.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +stress_fork() +{ + while true ; do + /usr/bin/true + sleep 0.01 + done +} + +stress_subsys() +{ + local verb=+ + while true ; do + echo $verb$subsys_ctrl >$sysfs/cgroup.subtree_control + [ $verb = "+" ] && verb=- || verb=+ + # incommensurable period with other stresses + sleep 0.011 + done +} + +init_and_check() +{ + sysfs=`mount -t cgroup2 | head -1 | awk '{ print $3 }'` + if [ ! -d "$sysfs" ]; then + echo "Skipping: cgroup2 is not mounted" >&2 + exit $ksft_skip + fi + + if ! echo +$subsys_ctrl >$sysfs/cgroup.subtree_control ; then + echo "Skipping: cannot enable $subsys_ctrl in $sysfs" >&2 + exit $ksft_skip + fi + + if ! echo -$subsys_ctrl >$sysfs/cgroup.subtree_control ; then + echo "Skipping: cannot disable $subsys_ctrl in $sysfs" >&2 + exit $ksft_skip + fi +} + +declare -a stresses +declare -a stress_pids +duration=5 +rc=0 +subsys_ctrl=cpuset +sysfs= + +while getopts c:d:hs: opt; do + case $opt in + c) + subsys_ctrl=$OPTARG + ;; + d) + duration=$OPTARG + ;; + h) + echo "Usage $0 [ -s stress ] ... [ -d duration ] [-c controller] cmd args .." + echo -e "\t default duration $duration seconds" + echo -e "\t default controller $subsys_ctrl" + exit + ;; + s) + func=stress_$OPTARG + if [ "x$(type -t $func)" != "xfunction" ] ; then + echo "Unknown stress $OPTARG" + exit 1 + fi + stresses+=($func) + ;; + esac +done +shift $((OPTIND - 1)) + +init_and_check + +for s in ${stresses[*]} ; do + $s & + stress_pids+=($!) +done + + +time=0 +start=$(date +%s) + +while [ $time -lt $duration ] ; do + $* + rc=$? + [ $rc -eq 0 ] || break + time=$(($(date +%s) - $start)) +done + +for pid in ${stress_pids[*]} ; do + kill -SIGTERM $pid + wait $pid +done + +exit $rc From a713af394cf382a30dd28a1015cbe572f1b9ca75 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Oct 2019 02:50:01 +1100 Subject: [PATCH 07/24] cgroup: pids: use atomic64_t for pids->limit Because pids->limit can be changed concurrently (but we don't want to take a lock because it would be needlessly expensive), use atomic64_ts instead. Fixes: commit 49b786ea146f ("cgroup: implement the PIDs subsystem") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Aleksa Sarai Signed-off-by: Tejun Heo --- kernel/cgroup/pids.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 8e513a573fe9..138059eb730d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -45,7 +45,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; - int64_t limit; + atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > p->limit) + if (new > limit) goto revert; } @@ -277,7 +278,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); From 5153faac18d293fc7abb19ff7034683fbcd82dc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Oct 2019 12:03:51 -0700 Subject: [PATCH 08/24] cgroup: remove cgroup_enable_task_cg_lists() optimization cgroup_enable_task_cg_lists() is used to lazyily initialize task cgroup associations on the first use to reduce fork / exit overheads on systems which don't use cgroup. Unfortunately, locking around it has never been actually correct and its value is dubious given how the vast majority of systems use cgroup right away from boot. This patch removes the optimization. For now, replace the cg_list based branches with WARN_ON_ONCE()'s to be on the safe side. We can simplify the logic further in the future. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - kernel/cgroup/cgroup.c | 186 +++++++++-------------------------------- kernel/cgroup/cpuset.c | 2 - 3 files changed, 40 insertions(+), 149 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3ba3e6da13a6..f6b048902d6c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,7 +150,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void cgroup_enable_task_cg_lists(void); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8b1c4fd47a7a..cf32c0c7a45d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1883,65 +1883,6 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static bool use_task_css_set_links __read_mostly; - -void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - do_each_thread(g, p) { - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - * - * Interrupts were already disabled while acquiring - * the css_set_lock, so we do not need to disable it - * again when acquiring the sighand->siglock here. - */ - spin_lock(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) { - struct css_set *cset = task_css_set(p); - - if (!css_set_populated(cset)) - css_set_update_populated(cset, true); - list_add_tail(&p->cg_list, &cset->tasks); - get_css_set(cset); - cset->nr_tasks++; - } - spin_unlock(&p->sighand->siglock); - } while_each_thread(g, p); -out_unlock: - spin_unlock_irq(&css_set_lock); - read_unlock(&tasklist_lock); -} - static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2187,13 +2128,6 @@ static int cgroup_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; @@ -2371,9 +2305,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (task->flags & PF_EXITING) return; - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - return; + /* cgroup_threadgroup_rwsem protects racing against forks */ + WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) @@ -4586,9 +4519,6 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { - /* no one should try to iterate before mounting cgroups */ - WARN_ON_ONCE(!use_task_css_set_links); - memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); @@ -6022,62 +5952,38 @@ void cgroup_cancel_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; + struct css_set *cset; int i; - /* - * This may race against cgroup_enable_task_cg_lists(). As that - * function sets use_task_css_set_links before grabbing - * tasklist_lock and we just went through tasklist_lock to add - * @child, it's guaranteed that either we see the set - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees - * @child during its iteration. - * - * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_lock guarantees both that the - * association is stable, and, on completion of the parent's - * migration, @child is visible in the source of migration or - * already in the destination cgroup. This guarantee is necessary - * when implementing operations which need to migrate all tasks of - * a cgroup to another. - * - * Note that if we lose to cgroup_enable_task_cg_lists(), @child - * will remain in init_css_set. This is safe because all tasks are - * in the init_css_set before cg_links is enabled and there's no - * operation which transfers all tasks out of init_css_set. - */ - if (use_task_css_set_links) { - struct css_set *cset; + spin_lock_irq(&css_set_lock); - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); /* current is @child's parent */ - if (list_empty(&child->cg_list)) { - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); - } + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + + /* + * If the cgroup has to be frozen, the new task has too. Let's set + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the + * frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient switch + * from the frozen state and back. */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. - */ - } - - spin_unlock_irq(&css_set_lock); } + spin_unlock_irq(&css_set_lock); + /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() @@ -6101,29 +6007,19 @@ void cgroup_exit(struct task_struct *tsk) struct css_set *cset; int i; - /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us (thanks to cgroup_threadgroup_rwsem), we can check css_set - * and cg_list without synchronization. - */ + spin_lock_irq(&css_set_lock); + + WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); + css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; - if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); - css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); - WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); - - spin_unlock_irq(&css_set_lock); - } else { - /* Take reference to avoid freeing init_css_set in cgroup_free, - * see cgroup_fork(). */ - get_css_set(cset); - } + spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { @@ -6140,12 +6036,10 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c52bc91f882b..faff8f99e8f2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -928,8 +928,6 @@ static void rebuild_root_domains(void) lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); - cgroup_enable_task_cg_lists(); - rcu_read_lock(); /* From 1bb5ec2eec48dcab1d8ae3707e4a388da6a9c9dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2019 12:49:57 -0800 Subject: [PATCH 09/24] cgroup: use cgroup->last_bstat instead of cgroup->bstat_pending for consistency cgroup->bstat_pending is used to determine the base stat delta to propagate to the parent. While correct, this is different from how percpu delta is determined for no good reason and the inconsistency makes the code more difficult to understand. This patch makes parent propagation delta calculation use the same method as percpu to global propagation. * cgroup_base_stat_accumulate() is renamed to cgroup_base_stat_add() and cgroup_base_stat_sub() is added. * percpu propagation calculation is updated to use the above helpers. * cgroup->bstat_pending is replaced with cgroup->last_bstat and updated to use the same calculation as percpu propagation. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- kernel/cgroup/rstat.c | 44 ++++++++++++++++++++----------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 430e219e3aba..4904b1ebd1ff 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -458,7 +458,7 @@ struct cgroup { struct list_head rstat_css_list; /* cgroup basic resource statistics */ - struct cgroup_base_stat pending_bstat; /* pending from children */ + struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ca19b4c8acf5..b48b22d4deb6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void) * Functions for cgroup basic resource statistics implemented on top of * rstat. */ -static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, - struct cgroup_base_stat *src_bstat) +static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } +static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime -= src_bstat->cputime.utime; + dst_bstat->cputime.stime -= src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +} + static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; - struct task_cputime cputime; - struct cgroup_base_stat delta; + struct cgroup_base_stat cur, delta; unsigned seq; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cputime = rstatc->bstat.cputime; + cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* calculate the delta to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; + /* propagate percpu delta to global */ + delta = cur; + cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(&rstatc->last_bstat, &delta); - /* transfer the pending stat into delta */ - cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); - memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_base_stat_accumulate(&cgrp->bstat, &delta); - if (parent) - cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); + /* propagate global delta to parent */ + if (parent) { + delta = cgrp->bstat; + cgroup_base_stat_sub(&delta, &cgrp->last_bstat); + cgroup_base_stat_add(&parent->bstat, &delta); + cgroup_base_stat_add(&cgrp->last_bstat, &delta); + } } static struct cgroup_rstat_cpu * From 742e8cd3e1ba6f19cad6d912f8d469df5557d0fd Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Wed, 30 Oct 2019 16:18:10 +0800 Subject: [PATCH 10/24] cgroup: freezer: don't change task and cgroups status unnecessarily It's not necessary to adjust the task state and revisit the state of source and destination cgroups if the cgroups are not in freeze state and the task itself is not frozen. And in this scenario, it wakes up the task who's not supposed to be ready to run. Don't do the unnecessary task state adjustment can help stop waking up the task without a reason. Signed-off-by: Honglei Wang Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/freezer.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 8cf010680678..3984dd6b8ddb 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -230,6 +230,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task, if (task->flags & PF_KTHREAD) return; + /* + * It's not necessary to do changes if both of the src and dst cgroups + * are not freezing and task is not frozen. + */ + if (!test_bit(CGRP_FREEZE, &src->flags) && + !test_bit(CGRP_FREEZE, &dst->flags) && + !task->frozen) + return; + /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not From d671fa6393d6788fc65555d4643b71cb3a361f36 Mon Sep 17 00:00:00 2001 From: Hewenliang Date: Mon, 11 Nov 2019 21:16:55 -0500 Subject: [PATCH 11/24] kselftests: cgroup: Avoid the reuse of fd after it is deallocated It is necessary to set fd to -1 when inotify_add_watch() fails in cg_prepare_for_wait. Otherwise the fd which has been closed in cg_prepare_for_wait may be misused in other functions such as cg_enter_and_wait_for_frozen and cg_freeze_wait. Fixes: 5313bfe425c8 ("selftests: cgroup: add freezer controller self-tests") Signed-off-by: Hewenliang Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_freezer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index aadc52a1a1b1..23d8fa4a3e4e 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -72,6 +72,7 @@ static int cg_prepare_for_wait(const char *cgroup) if (ret == -1) { debug("Error: inotify_add_watch() failed\n"); close(fd); + fd = -1; } return fd; From e23f568aa63f64cd6b355094224cc9356c0f696b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: [PATCH 12/24] kernfs: fix ino wrap-around detection When the 32bit ino wraps around, kernfs increments the generation number to distinguish reused ino instances. The wrap-around detection tests whether the allocated ino is lower than what the cursor but the cursor is pointing to the next ino to allocate so the condition never triggers. Fix it by remembering the last ino and comparing against that. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Fixes: 4a3ef68acacf ("kernfs: implement i_generation") Cc: Namhyung Kim Cc: stable@vger.kernel.org # v4.14+ --- fs/kernfs/dir.c | 5 ++--- include/linux/kernfs.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6ebae6bbe6a5..7d4af6cea2a6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -622,7 +622,6 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, { struct kernfs_node *kn; u32 gen; - int cursor; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -635,11 +634,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); - cursor = idr_get_cursor(&root->ino_idr); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); - if (ret >= 0 && ret < cursor) + if (ret >= 0 && ret < root->last_ino) root->next_generation++; gen = root->next_generation; + root->last_ino = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 936b61bd504e..f797ccc650e7 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -187,6 +187,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + u32 last_ino; u32 next_generation; struct kernfs_syscall_ops *syscall_ops; From f05499a06fb4348b935dd2ffc2c86a76705486be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: [PATCH 13/24] writeback: use ino_t for inodes in tracepoints Writeback TPs currently use mix of 32 and 64bits for inos. This isn't currently broken because only cgroup inos are using 32bits and they're limited to 32bits. cgroup inos will make use of 64bits. Let's uniformly use ino_t. While at it, switch the default cgroup ino value used when cgroup is disabled to 1 instead of -1U as root cgroup always uses ino 1. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Cc: Jens Axboe Cc: Namhyung Kim --- include/trace/events/writeback.h | 88 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c2ce6480b4b1..95e50677476b 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(pgoff_t, index) ), @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), @@ -150,28 +150,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK -static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return wb->memcg_css->cgroup->kn->id.ino; } -static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else - return -1U; + return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ -static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return -1U; + return 1; } -static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { - return -1U; + return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ @@ -187,8 +187,8 @@ TRACE_EVENT(inode_foreign_history, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) - __field(unsigned int, cgroup_ino) + __field(ino_t, ino) + __field(ino_t, cgroup_ino) __field(unsigned int, history) ), @@ -199,7 +199,7 @@ TRACE_EVENT(inode_foreign_history, __entry->history = history; ), - TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x", + TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, __entry->ino, __entry->cgroup_ino, @@ -216,9 +216,9 @@ TRACE_EVENT(inode_switch_wbs, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) - __field(unsigned int, old_cgroup_ino) - __field(unsigned int, new_cgroup_ino) + __field(ino_t, ino) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) ), TP_fast_assign( @@ -228,7 +228,7 @@ TRACE_EVENT(inode_switch_wbs, __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), - TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, __entry->ino, __entry->old_cgroup_ino, @@ -245,10 +245,10 @@ TRACE_EVENT(track_foreign_dirty, TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned int, memcg_id) - __field(unsigned int, cgroup_ino) - __field(unsigned int, page_cgroup_ino) + __field(ino_t, cgroup_ino) + __field(ino_t, page_cgroup_ino) ), TP_fast_assign( @@ -263,7 +263,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; ), - TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u", + TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, __entry->ino, @@ -282,7 +282,7 @@ TRACE_EVENT(flush_foreign, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), @@ -294,7 +294,7 @@ TRACE_EVENT(flush_foreign, __entry->frn_memcg_id = frn_memcg_id; ), - TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u", + TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, __entry->cgroup_ino, __entry->frn_bdi_id, @@ -311,9 +311,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(int, sync_mode) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -324,7 +324,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, __entry->ino, __entry->sync_mode, @@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, range_cyclic) __field(int, for_background) __field(int, reason) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, @@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -413,13 +413,13 @@ DECLARE_EVENT_CLASS(writeback_class, TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: cgroup_ino=%u", + TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, __entry->cgroup_ino ) @@ -459,7 +459,7 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -478,7 +478,7 @@ DECLARE_EVENT_CLASS(wbc_class, TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " - "start=0x%lx end=0x%lx cgroup_ino=%u", + "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -510,7 +510,7 @@ TRACE_EVENT(writeback_queue_io, __field(long, age) __field(int, moved) __field(int, reason) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -522,7 +522,7 @@ TRACE_EVENT(writeback_queue_io, __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ @@ -596,7 +596,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -614,7 +614,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu cgroup_ino=%u", + "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ @@ -660,7 +660,7 @@ TRACE_EVENT(balance_dirty_pages, __field( long, pause) __field(unsigned long, period) __field( long, think) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -692,7 +692,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u", + "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, @@ -718,10 +718,10 @@ TRACE_EVENT(writeback_sb_inodes_requeue, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -733,7 +733,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -789,13 +789,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -811,7 +811,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u", + "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -845,7 +845,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_STRUCT__entry( __field( dev_t, dev ) - __field(unsigned long, ino ) + __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) From db53c73a8b5db120cb741d7d932cdf831a576e8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: [PATCH 14/24] netprio: use css ID instead of cgroup ID netprio uses cgroup ID to index the priority mapping table. This is currently okay as cgroup IDs are allocated using idr and packed. However, cgroup IDs will be changed to use full 64bit range and won't be packed making this impractical. netprio doesn't care what type of IDs it uses as long as they can identify the controller instances and are packed. Let's switch to css IDs instead of cgroup IDs. Signed-off-by: Tejun Heo Acked-by: Neil Horman Reviewed-by: Greg Kroah-Hartman Cc: "David S. Miller" Cc: Namhyung Kim --- include/net/netprio_cgroup.h | 2 +- net/core/netprio_cgroup.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index cfc9441ef074..dec7522b6ce1 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -26,7 +26,7 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_lock(); css = task_css(p, net_prio_cgrp_id); - idx = css->cgroup->id; + idx = css->id; rcu_read_unlock(); return idx; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 256b7954b720..8881dd943dd0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); - int id = css->cgroup->id; + int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; @@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; - int id = css->cgroup->id; + int id = css->id; int ret; /* avoid extending priomap for zero writes */ @@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return css->cgroup->id; + return css->id; } static int read_priomap(struct seq_file *sf, void *v) @@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { - void *v = (void *)(unsigned long)css->cgroup->id; + void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); From b680b08171ebf890a4ebb7f82ada9959f4534ade Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: [PATCH 15/24] kernfs: use dumber locking for kernfs_find_and_get_node_by_ino() kernfs_find_and_get_node_by_ino() uses RCU protection. It's currently a bit buggy because it can look up a node which hasn't been activated yet and thus may end up exposing a node that the kernfs user is still prepping. While it can be fixed by pushing it further in the current direction, it's already complicated and isn't clear whether the complexity is justified. The main use of kernfs_find_and_get_node_by_ino() is for exportfs operations. They aren't super hot and all the follow-up operations (e.g. mapping to path) use normal locking anyway. Let's switch to a dumber locking scheme and protect the lookup with kernfs_idr_lock. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- fs/kernfs/dir.c | 45 +++++++++------------------------------------ fs/kernfs/mount.c | 11 +---------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7d4af6cea2a6..798f0f03b62b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -508,10 +508,6 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; - /* - * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino - * depends on this to filter reused stale node - */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -646,11 +642,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->id.ino = ret; kn->id.generation = gen; - /* - * set ino first. This RELEASE is paired with atomic_inc_not_zero in - * kernfs_find_and_get_node_by_ino - */ - atomic_set_release(&kn->count, 1); + atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -716,38 +708,19 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, { struct kernfs_node *kn; - rcu_read_lock(); + spin_lock(&kernfs_idr_lock); + kn = idr_find(&root->ino_idr, ino); if (!kn) - goto out; + goto err_unlock; - /* - * Since kernfs_node is freed in RCU, it's possible an old node for ino - * is freed, but reused before RCU grace period. But a freed node (see - * kernfs_put) or an incompletedly initialized node (see - * __kernfs_new_node) should have 'count' 0. We can use this fact to - * filter out such node. - */ - if (!atomic_inc_not_zero(&kn->count)) { - kn = NULL; - goto out; - } - - /* - * The node could be a new node or a reused node. If it's a new node, - * we are ok. If it's reused because of RCU (because of - * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' - * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, - * hence we can use 'ino' to filter stale node. - */ - if (kn->id.ino != ino) - goto out; - rcu_read_unlock(); + if (unlikely(!atomic_inc_not_zero(&kn->count))) + goto err_unlock; + spin_unlock(&kernfs_idr_lock); return kn; -out: - rcu_read_unlock(); - kernfs_put(kn); +err_unlock: + spin_unlock(&kernfs_idr_lock); return NULL; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 6c12fac2c287..067b7c380056 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -363,18 +363,9 @@ void kernfs_kill_sb(struct super_block *sb) void __init kernfs_init(void) { - - /* - * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino - * can access the slab lock free. This could introduce stale nodes, - * please see how kernfs_find_and_get_node_by_ino filters out stale - * nodes. - */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, - SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, - NULL); + 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", From 880df131617393252f1fff701ed5b7b6d14c52c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: [PATCH 16/24] kernfs: kernfs_find_and_get_node_by_ino() should only look up activated nodes kernfs node can be created in two separate steps - allocation and activation. This is used to make kernfs nodes visible only after the internal states attached to the node are fully initialized. kernfs_find_and_get_node_by_id() currently allows lookups of nodes which aren't activated yet and thus can expose nodes are which are still being prepped by kernfs users. Fix it by disallowing lookups of nodes which aren't activated yet. kernfs_find_and_get_node_by_ino() Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- fs/kernfs/dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 798f0f03b62b..beabb585a7d8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -714,7 +714,13 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, if (!kn) goto err_unlock; - if (unlikely(!atomic_inc_not_zero(&kn->count))) + /* + * ACTIVATED is protected with kernfs_mutex but it was clear when + * @kn was added to idr and we just wanna see it set. No need to + * grab kernfs_mutex. + */ + if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || + !atomic_inc_not_zero(&kn->count))) goto err_unlock; spin_unlock(&kernfs_idr_lock); From 67c0496e87d193b8356d2af49ab95e8a1b954b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: [PATCH 17/24] kernfs: convert kernfs_node->id from union kernfs_node_id to u64 kernfs_node->id is currently a union kernfs_node_id which represents either a 32bit (ino, gen) pair or u64 value. I can't see much value in the usage of the union - all that's needed is a 64bit ID which the current code is already limited to. Using a union makes the code unnecessarily complicated and prevents using 64bit ino without adding practical benefits. This patch drops union kernfs_node_id and makes kernfs_node->id a u64. ino is stored in the lower 32bits and gen upper. Accessors - kernfs[_id]_ino() and kernfs[_id]_gen() - are added to retrieve the ino and gen. This simplifies ID handling less cumbersome and will allow using 64bit inos on supported archs. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim Cc: Jens Axboe Cc: Alexei Starovoitov --- fs/kernfs/dir.c | 10 ++--- fs/kernfs/file.c | 4 +- fs/kernfs/inode.c | 4 +- fs/kernfs/mount.c | 7 ++-- include/linux/cgroup.h | 17 ++++---- include/linux/kernfs.h | 45 ++++++++++++--------- include/trace/events/writeback.h | 4 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 3 +- kernel/trace/blktrace.c | 67 +++++++++++++++----------------- net/core/filter.c | 4 +- 12 files changed, 85 insertions(+), 84 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index beabb585a7d8..c67afb591e5b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -532,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -639,8 +639,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->id.ino = ret; - kn->id.generation = gen; + + kn->id = (u64)gen << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -671,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -1656,7 +1656,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->id.ino; + ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e8c792b49616..34366db3620d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -892,7 +892,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->id.ino); + inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; @@ -901,7 +901,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->id.ino); + p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, &name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f3eaa8869f42..eac277c63d42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->id.generation; + inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->id.ino); + inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 067b7c380056..f05d5d6f926d 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,15 +57,14 @@ const struct super_operations kernfs_sops = { * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode * number and generation */ -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id) +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; - kn = kernfs_find_and_get_node_by_ino(root, id->ino); + kn = kernfs_find_and_get_node_by_ino(root, kernfs_id_ino(id)); if (!kn) return NULL; - if (kn->id.generation != id->generation) { + if (kernfs_gen(kn) != kernfs_id_gen(id)) { kernfs_put(kn); return NULL; } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b048902d6c..815fff49d555 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -616,7 +616,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->id.ino; + return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ @@ -687,13 +687,12 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return &cgrp->kn->id; + return cgrp->kn->id; } -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen); +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -718,9 +717,9 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return NULL; + return 0; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) @@ -739,8 +738,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return true; } -static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) {} +static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) +{} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index f797ccc650e7..b2fc5c8ef6d9 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -104,21 +104,6 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; -/* represent a kernfs node */ -union kernfs_node_id { - struct { - /* - * blktrace will export this struct as a simplified 'struct - * fid' (which is a big data struction), so userspace can use - * it to find kernfs node. The layout must match the first two - * fields of 'struct fid' exactly. - */ - u32 ino; - u32 generation; - }; - u64 id; -}; - /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -155,7 +140,12 @@ struct kernfs_node { void *priv; - union kernfs_node_id id; + /* + * 64bit unique ID. Lower 32bits carry the inode number and lower + * generation. + */ + u64 id; + unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; @@ -292,6 +282,26 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) return kn->flags & KERNFS_TYPE_MASK; } +static inline ino_t kernfs_id_ino(u64 id) +{ + return (u32)id; +} + +static inline u32 kernfs_id_gen(u64 id) +{ + return id >> 32; +} + +static inline ino_t kernfs_ino(struct kernfs_node *kn) +{ + return kernfs_id_ino(kn->id); +} + +static inline ino_t kernfs_gen(struct kernfs_node *kn) +{ + return kernfs_id_gen(kn->id); +} + /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty @@ -383,8 +393,7 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 95e50677476b..b4f0ffe1817e 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -152,7 +152,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->id.ino; + return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) @@ -260,7 +260,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; + __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..912e761cd17a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgrp->kn->id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index addd6fdceec8..5d867f6d7204 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup->kn->id; map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cf32c0c7a45d..c6bd1a5a1977 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5786,8 +5786,7 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..a986d2e74ca2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; + return 0; return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + - (has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1269,9 +1263,10 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, blkcg_name_buf, act, rwbs); } else trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %lx,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + kernfs_id_ino(id), kernfs_id_gen(id), + act, rwbs); } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..b360a7beb6fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id.id; + return cgrp->kn->id; } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id.id; + return ancestor->kn->id; } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { From fe0f726c9fb626b1092a9ea3bf75f57f2eed676e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: [PATCH 18/24] kernfs: combine ino/id lookup functions into kernfs_find_and_get_node_by_id() kernfs_find_and_get_node_by_ino() looks the kernfs_node matching the specified ino. On top of that, kernfs_get_node_by_id() and kernfs_fh_get_inode() implement full ID matching by testing the rest of ID. On surface, confusingly, the two are slightly different in that the latter uses 0 gen as wildcard while the former doesn't - does it mean that the latter can't uniquely identify inodes w/ 0 gen? In practice, this is a distinction without a difference because generation number starts at 1. There are no actual IDs with 0 gen, so it can always safely used as wildcard. Let's simplify the code by renaming kernfs_find_and_get_node_by_ino() to kernfs_find_and_get_node_by_id(), moving all lookup logics into it, and removing now unnecessary kernfs_get_node_by_id(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 17 +++++++++++++---- fs/kernfs/kernfs-internal.h | 2 -- fs/kernfs/mount.c | 26 ++------------------------ include/linux/kernfs.h | 3 ++- kernel/cgroup/cgroup.c | 2 +- 5 files changed, 18 insertions(+), 32 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index c67afb591e5b..5dcf19d4adbc 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -696,17 +696,22 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, } /* - * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root - * @ino: inode number + * @id: the target node id + * + * @id's lower 32bits encode ino and upper gen. If the gen portion is + * zero, all generations are matched. * * RETURNS: * NULL on failure. Return a kernfs node with reference counter incremented */ -struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, - unsigned int ino) +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id) { struct kernfs_node *kn; + ino_t ino = kernfs_id_ino(id); + u32 gen = kernfs_id_gen(id); spin_lock(&kernfs_idr_lock); @@ -714,6 +719,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, if (!kn) goto err_unlock; + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; + /* * ACTIVATED is protected with kernfs_mutex but it was clear when * @kn was added to idr and we just wanna see it set. No need to diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 02ce570a9a3c..2f3c51d55261 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -109,8 +109,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); -struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, - unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f05d5d6f926d..8aed2cccd002 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -53,24 +53,6 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; -/* - * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode - * number and generation - */ -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id) -{ - struct kernfs_node *kn; - - kn = kernfs_find_and_get_node_by_ino(root, kernfs_id_ino(id)); - if (!kn) - return NULL; - if (kernfs_gen(kn) != kernfs_id_gen(id)) { - kernfs_put(kn); - return NULL; - } - return kn; -} - static struct inode *kernfs_fh_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -81,7 +63,8 @@ static struct inode *kernfs_fh_get_inode(struct super_block *sb, if (ino == 0) return ERR_PTR(-ESTALE); - kn = kernfs_find_and_get_node_by_ino(info->root, ino); + kn = kernfs_find_and_get_node_by_id(info->root, + ino | ((u64)generation << 32)); if (!kn) return ERR_PTR(-ESTALE); inode = kernfs_get_inode(sb, kn); @@ -89,11 +72,6 @@ static struct inode *kernfs_fh_get_inode(struct super_block *sb, if (!inode) return ERR_PTR(-ESTALE); - if (generation && inode->i_generation != generation) { - /* we didn't find the right inode.. */ - iput(inode); - return ERR_PTR(-ESTALE); - } return inode; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b2fc5c8ef6d9..38267cc9420c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -393,7 +393,8 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id); +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c6bd1a5a1977..b5dcbee5aa6c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5790,7 +5790,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); From 33c5ac9175195c36a0b7005aaf503a2e81f117a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: [PATCH 19/24] kernfs: implement custom exportfs ops and fid type The current kernfs exportfs implementation uses the generic_fh_*() helpers and FILEID_INO32_GEN[_PARENT] which limits ino to 32bits. Let's implement custom exportfs operations and fid type to remove the restriction. * FILEID_KERNFS is a single u64 value whose content is kernfs_node->id. This is the only native fid type. * For backward compatibility with blk_log_action() path which exposes (ino,gen) pairs which userland assembles into FILEID_INO32_GEN keys, combine the generic keys into 64bit IDs in the same order. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- fs/kernfs/mount.c | 77 +++++++++++++++++++++++++++++++--------- include/linux/exportfs.h | 5 +++ 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 8aed2cccd002..37a1e5df117a 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -53,40 +53,84 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; -static struct inode *kernfs_fh_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent) +{ + struct kernfs_node *kn = inode->i_private; + + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + *max_len = 2; + *(u64 *)fh = kn->id; + return FILEID_KERNFS; +} + +static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); - struct inode *inode; struct kernfs_node *kn; + struct inode *inode; + u64 id; - if (ino == 0) - return ERR_PTR(-ESTALE); + if (fh_len < 2) + return NULL; - kn = kernfs_find_and_get_node_by_id(info->root, - ino | ((u64)generation << 32)); + switch (fh_type) { + case FILEID_KERNFS: + id = *(u64 *)fid; + break; + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + /* + * blk_log_action() exposes (ino,gen) pair without type and + * userland can call us with generic fid constructed from + * them. Combine it back to ID. See blk_log_action(). + */ + id = ((u64)fid->i32.gen << 32) | fid->i32.ino; + break; + default: + return NULL; + } + + kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); + + if (get_parent) { + struct kernfs_node *parent; + + parent = kernfs_get_parent(kn); + kernfs_put(kn); + kn = parent; + if (!kn) + return ERR_PTR(-ESTALE); + } + inode = kernfs_get_inode(sb, kn); kernfs_put(kn); if (!inode) return ERR_PTR(-ESTALE); - return inode; + return d_obtain_alias(inode); } -static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - kernfs_fh_get_inode); + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } -static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - kernfs_fh_get_inode); + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) @@ -97,6 +141,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child) } static const struct export_operations kernfs_export_ops = { + .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cf6571fc9c01..d896b8657085 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -104,6 +104,11 @@ enum fid_type { */ FILEID_LUSTRE = 0x97, + /* + * 64 bit unique kernfs id + */ + FILEID_KERNFS = 0xfe, + /* * Filesystems must not use 0xff file ID. */ From 40430452fd5da1509177ac597b394614cd3a121f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: [PATCH 20/24] kernfs: use 64bit inos if ino_t is 64bit Each kernfs_node is identified with a 64bit ID. The low 32bit is exposed as ino and the high gen. While this already allows using inos as keys by looking up with wildcard generation number of 0, it's adding unnecessary complications for 64bit ino archs which can directly use kernfs_node IDs as inos to uniquely identify each cgroup instance. This patch exposes IDs directly as inos on 64bit ino archs. The conversion is mostly straight-forward. * 32bit ino archs behave the same as before. 64bit ino archs now use the whole 64bit ID as ino and the generation number is fixed at 1. * 64bit inos still use the same idr allocator which gurantees that the lower 32bits identify the current live instance uniquely and the high 32bits are incremented whenever the low bits wrap. As the upper 32bits are no longer used as gen and we don't wanna start ino allocation with 33rd bit set, the initial value for highbits allocation is changed to 0 on 64bit ino archs. * blktrace exposes two 32bit numbers - (INO,GEN) pair - to identify the issuing cgroup. Userland builds FILEID_INO32_GEN fids from these numbers to look up the cgroups. To remain compatible with the behavior, always output (LOW32,HIGH32) which will be constructed back to the original 64bit ID by __kernfs_fh_to_dentry(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- fs/kernfs/dir.c | 42 ++++++++++++++++++++++++++++------------- fs/kernfs/mount.c | 7 ++++--- include/linux/kernfs.h | 20 ++++++++++++++------ kernel/trace/blktrace.c | 21 +++++++++++++++++---- 4 files changed, 64 insertions(+), 26 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5dcf19d4adbc..b2d9f79c4a7c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -532,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kernfs_ino(kn)); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -617,7 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; - u32 gen; + u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -631,16 +631,16 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); - if (ret >= 0 && ret < root->last_ino) - root->next_generation++; - gen = root->next_generation; - root->last_ino = ret; + if (ret >= 0 && ret < root->last_id_lowbits) + root->id_highbits++; + id_highbits = root->id_highbits; + root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; - kn->id = (u64)gen << 32 | ret; + kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -671,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kernfs_ino(kn)); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -715,13 +715,19 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, spin_lock(&kernfs_idr_lock); - kn = idr_find(&root->ino_idr, ino); + kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; - /* 0 matches all generations */ - if (unlikely(gen && kernfs_gen(kn) != gen)) - goto err_unlock; + if (sizeof(ino_t) >= sizeof(u64)) { + /* we looked up with the low 32bits, compare the whole */ + if (kernfs_ino(kn) != ino) + goto err_unlock; + } else { + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; + } /* * ACTIVATED is protected with kernfs_mutex but it was clear when @@ -949,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); - root->next_generation = 1; + + /* + * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. + * High bits generation. The starting value for both ino and + * genenration is 1. Initialize upper 32bit allocation + * accordingly. + */ + if (sizeof(ino_t) >= sizeof(u64)) + root->id_highbits = 0; + else + root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 37a1e5df117a..4d31503abaee 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -87,9 +87,10 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* - * blk_log_action() exposes (ino,gen) pair without type and - * userland can call us with generic fid constructed from - * them. Combine it back to ID. See blk_log_action(). + * blk_log_action() exposes "LOW32,HIGH32" pair without + * type and userland can call us with generic fid + * constructed from them. Combine it back to ID. See + * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 38267cc9420c..dded2e5a9f42 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -141,8 +141,8 @@ struct kernfs_node { void *priv; /* - * 64bit unique ID. Lower 32bits carry the inode number and lower - * generation. + * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, + * the low 32bits are ino and upper generation. */ u64 id; @@ -177,8 +177,8 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; - u32 last_ino; - u32 next_generation; + u32 last_id_lowbits; + u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ @@ -284,12 +284,20 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) static inline ino_t kernfs_id_ino(u64 id) { - return (u32)id; + /* id is ino if ino_t is 64bit; otherwise, low 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return id; + else + return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { - return id >> 32; + /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return 1; + else + return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a986d2e74ca2..a7dac5b63f3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1261,12 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); - } else + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regarldess of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry(). + */ trace_seq_printf(&iter->seq, - "%3d,%-3d %lx,%-x %2s %3s ", + "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), - kernfs_id_ino(id), kernfs_id_gen(id), - act, rwbs); + id & U32_MAX, id >> 32, act, rwbs); + } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); From 743210386c0354a2f8ef3d697353c7d8477fa81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: [PATCH 21/24] cgroup: use cgrp->kn->id as the cgroup ID cgroup ID is currently allocated using a dedicated per-hierarchy idr and used internally and exposed through tracepoints and bpf. This is confusing because there are tracepoints and other interfaces which use the cgroupfs ino as IDs. The preceding changes made kn->id exposed as ino as 64bit ino on supported archs or ino+gen (low 32bits as ino, high gen). There's no reason for cgroup to use different IDs. The kernfs IDs are unique and userland can easily discover them and map them back to paths using standard file operations. This patch replaces cgroup IDs with kernfs IDs. * cgroup_id() is added and all cgroup ID users are converted to use it. * kernfs_node creation is moved to earlier during cgroup init so that cgroup_id() is available during init. * While at it, s/cgroup/cgrp/ in psi helpers for consistency. * Fallback ID value is changed to 1 to be consistent with root cgroup ID. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- include/linux/cgroup-defs.h | 17 +------- include/linux/cgroup.h | 17 ++++---- include/trace/events/cgroup.h | 6 +-- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 76 ++++++++++++----------------------- kernel/trace/blktrace.c | 4 +- net/core/filter.c | 4 +- 8 files changed, 43 insertions(+), 85 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4904b1ebd1ff..63097cb243cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -354,16 +354,6 @@ struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with @@ -488,7 +478,7 @@ struct cgroup { struct cgroup_freezer_state freezer; /* ids of the ancestors at each level including self */ - int ancestor_ids[]; + u64 ancestor_ids[]; }; /* @@ -509,7 +499,7 @@ struct cgroup_root { struct cgroup cgrp; /* for cgrp->ancestor_ids[0] */ - int cgrp_ancestor_id_storage; + u64 cgrp_ancestor_id_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -520,9 +510,6 @@ struct cgroup_root { /* Hierarchy-specific flags */ unsigned int flags; - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 815fff49d555..d7ddebd0cdec 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -304,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. */ +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + /** * css_get - obtain a reference on the specified css * @css: target css @@ -565,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == ancestor->id; + return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** @@ -687,17 +692,13 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return cgrp->kn->id; -} - void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; +static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -717,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return 0; -} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index a566cc521476..7f42a3de59e6 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), @@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; - __entry->dst_id = dst_cgrp->id; + __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; @@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 912e761cd17a..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5d867f6d7204..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b5dcbee5aa6c..c12dcf7dc432 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1308,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1917,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1938,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -1976,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -3552,22 +3544,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4987,9 +4979,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5154,10 +5143,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5177,15 +5168,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5195,7 +5184,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5219,7 +5208,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5248,12 +5237,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); - /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. @@ -5267,8 +5250,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5305,7 +5288,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc//cgroup unparsable */ @@ -5321,27 +5303,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. */ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5356,7 +5330,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7dac5b63f3f..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -171,7 +171,7 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); + blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif @@ -759,7 +759,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bio->bi_blkg) return 0; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) diff --git a/net/core/filter.c b/net/core/filter.c index b360a7beb6fc..caef7c74cad5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id; + return cgroup_id(cgrp); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id; + return cgroup_id(ancestor); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { From d7495343228f30d8206e92dccfd1c41adcfa142d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Nov 2019 14:46:51 -0800 Subject: [PATCH 22/24] cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root() 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") added WARN which triggers if cgroup_id(root_cgrp) is not 1. This is fine on 64bit ino archs but on 32bit archs cgroup ID is ((gen << 32) | ino) and gen starts at 1, so the root id is 0x1_0000_0001 instead of 1 always triggering the WARN. What we wanna make sure is that the ino part is 1. Fix it. Reported-by: Naresh Kamboju Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c12dcf7dc432..53098c1d45e2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1966,7 +1966,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; - WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); From 03189e8ed5b3f17550b478043f671a9dbd2b3141 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Mon, 11 Nov 2019 14:44:38 +0000 Subject: [PATCH 23/24] docs: cgroup: mm: Fix spelling of "list" Signed-off-by: Chris Down Cc: Andrew Morton Cc: linux-kernel@vger.kernel.org Cc: cgroups@vger.kernel.org Cc: linux-mm@kvack.org Cc: kernel-team@fb.com Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0fa8c0e615c2..2357aa28ddda 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1328,7 +1328,7 @@ PAGE_SIZE multiple when read back. pgdeactivate - Amount of pages moved to the inactive LRU lis + Amount of pages moved to the inactive LRU list pglazyfree From 40363cf13999ee4fb3b5c1e67fa5e6f0e9da34bd Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 14 Nov 2019 12:17:41 -0500 Subject: [PATCH 24/24] writeback: fix -Wformat compilation warnings The commit f05499a06fb4 ("writeback: use ino_t for inodes in tracepoints") introduced a lot of GCC compilation warnings on s390, In file included from ./include/trace/define_trace.h:102, from ./include/trace/events/writeback.h:904, from fs/fs-writeback.c:82: ./include/trace/events/writeback.h: In function 'trace_raw_output_writeback_page_template': ./include/trace/events/writeback.h:76:12: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'ino_t' {aka 'unsigned int'} [-Wformat=] TP_printk("bdi %s: ino=%lu index=%lu", ^~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/trace/trace_events.h:360:22: note: in definition of macro 'DECLARE_EVENT_CLASS' trace_seq_printf(s, print); \ ^~~~~ ./include/trace/events/writeback.h:76:2: note: in expansion of macro 'TP_printk' TP_printk("bdi %s: ino=%lu index=%lu", ^~~~~~~~~ Fix them by adding necessary casts where ino_t could be either "unsigned int" or "unsigned long". Fixes: f05499a06fb4 ("writeback: use ino_t for inodes in tracepoints") Signed-off-by: Qian Cai Signed-off-by: Tejun Heo --- include/trace/events/writeback.h | 48 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b4f0ffe1817e..ef50be4e5e6c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -75,7 +75,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, __entry->index ) ); @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) @@ -201,8 +201,8 @@ TRACE_EVENT(inode_foreign_history, TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, - __entry->ino, - __entry->cgroup_ino, + (unsigned long)__entry->ino, + (unsigned long)__entry->cgroup_ino, __entry->history ) ); @@ -230,9 +230,9 @@ TRACE_EVENT(inode_switch_wbs, TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, - __entry->ino, - __entry->old_cgroup_ino, - __entry->new_cgroup_ino + (unsigned long)__entry->ino, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino ) ); @@ -266,10 +266,10 @@ TRACE_EVENT(track_foreign_dirty, TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, - __entry->ino, + (unsigned long)__entry->ino, __entry->memcg_id, - __entry->cgroup_ino, - __entry->page_cgroup_ino + (unsigned long)__entry->cgroup_ino, + (unsigned long)__entry->page_cgroup_ino ) ); @@ -296,7 +296,7 @@ TRACE_EVENT(flush_foreign, TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, - __entry->cgroup_ino, + (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) @@ -326,9 +326,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, __entry->sync_mode, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -383,7 +383,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -421,7 +421,7 @@ DECLARE_EVENT_CLASS(writeback_class, ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ @@ -489,7 +489,7 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic, __entry->range_start, __entry->range_end, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ) @@ -528,7 +528,7 @@ TRACE_EVENT(writeback_queue_io, __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -622,7 +622,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -707,7 +707,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -735,11 +735,11 @@ TRACE_EVENT(writeback_sb_inodes_requeue, TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -813,14 +813,14 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -861,7 +861,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->dirtied_when, + (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) );