From db8dd9697238be70a6b4f9d0284cd89f59c0e070 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 30 Jan 2020 13:34:49 +0300 Subject: [PATCH 1/5] cgroup-v1: cgroup_pidlist_next should update position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. # mount | grep cgroup # dd if=/mnt/cgroup.procs bs=1 # normal output ... 1294 1295 1296 1304 1382 584+0 records in 584+0 records out 584 bytes copied dd: /mnt/cgroup.procs: cannot skip to specified offset 83 <<< generates end of last line 1383 <<< ... and whole last line once again 0+1 records in 0+1 records out 8 bytes copied dd: /mnt/cgroup.procs: cannot skip to specified offset 1386 <<< generates last line anyway 0+1 records in 0+1 records out 5 bytes copied https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index be1a1c83cdd1..9a6f060cbf51 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) */ p++; if (p >= end) { + (*pos)++; return NULL; } else { *pos = *p; From 2d4ecb030dcc90fb725ecbfc82ce5d6c37906e0e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 30 Jan 2020 13:34:59 +0300 Subject: [PATCH 2/5] cgroup: cgroup_procs_next should increase position index If seq_file .next fuction does not change position index, read after some lseek can generate unexpected output: 1) dd bs=1 skip output of each 2nd elements $ dd if=/sys/fs/cgroup/cgroup.procs bs=8 count=1 2 3 4 5 1+0 records in 1+0 records out 8 bytes copied, 0,000267297 s, 29,9 kB/s [test@localhost ~]$ dd if=/sys/fs/cgroup/cgroup.procs bs=1 count=8 2 4 <<< NB! 3 was skipped 6 <<< ... and 5 too 8 <<< ... and 7 8+0 records in 8+0 records out 8 bytes copied, 5,2123e-05 s, 153 kB/s This happen because __cgroup_procs_start() makes an extra extra cgroup_procs_next() call 2) read after lseek beyond end of file generates whole last line. 3) read after lseek into middle of last line generates expected rest of last line and unexpected whole line once again. Additionally patch removes an extra position index changes in __cgroup_procs_start() Cc: stable@vger.kernel.org https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..927f7b82e5c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4595,6 +4595,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; + if (pos) + (*pos)++; + return css_task_iter_next(it); } @@ -4610,7 +4613,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, * from position 0, so we can simply keep iterating on !0 *pos. */ if (!it) { - if (WARN_ON_ONCE((*pos)++)) + if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); it = kzalloc(sizeof(*it), GFP_KERNEL); @@ -4618,10 +4621,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, return ERR_PTR(-ENOMEM); of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); - } else if (!(*pos)++) { + } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); - } + } else + return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } From 9c974c77246460fa6a92c18554c3311c8c83c160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 24 Jan 2020 12:40:15 +0100 Subject: [PATCH 3/5] cgroup: Iterate tasks that did not finish do_exit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PF_EXITING is set earlier than actual removal from css_set when a task is exitting. This can confuse cgroup.procs readers who see no PF_EXITING tasks, however, rmdir is checking against css_set membership so it can transitionally fail with EBUSY. Fix this by listing tasks that weren't unlinked from css_set active lists. It may happen that other users of the task iterator (without CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This is equal to the state before commit c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") but it may be reviewed later. Reported-by: Suren Baghdasaryan Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d7ddebd0cdec..e75d2191226b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,6 +62,7 @@ struct css_task_iter { struct list_head *mg_tasks_head; struct list_head *dying_tasks_head; + struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 927f7b82e5c1..c719a4154d6d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) + if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; - else if (!list_empty(&cset->mg_tasks)) + it->cur_tasks_head = &cset->tasks; + } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; - else + it->cur_tasks_head = &cset->mg_tasks; + } else { it->task_pos = cset->dying_tasks.next; + it->cur_tasks_head = &cset->dying_tasks; + } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; @@ -4463,10 +4467,14 @@ repeat: else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) + if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; - if (it->task_pos == it->mg_tasks_head) + it->cur_tasks_head = it->mg_tasks_head; + } + if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; + it->cur_tasks_head = it->dying_tasks_head; + } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { @@ -4485,11 +4493,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (!atomic_read(&task->signal->live)) + if (it->cur_tasks_head == it->dying_tasks_head && + !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (task->flags & PF_EXITING) + if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } From 190ecb190a9cd8c0599d8499b901e3c32e87966a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sun, 23 Feb 2020 22:00:07 -0500 Subject: [PATCH 4/5] cgroup: fix psi_show() crash on 32bit ino archs Similar to the commit d7495343228f ("cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root()"), cgroup_id(root_cgrp) does not equal to 1 on 32bit ino archs which triggers all sorts of issues with psi_show() on s390x. For example, BUG: KASAN: slab-out-of-bounds in collect_percpu_times+0x2d0/ Read of size 4 at addr 000000001e0ce000 by task read_all/3667 collect_percpu_times+0x2d0/0x798 psi_show+0x7c/0x2a8 seq_read+0x2ac/0x830 vfs_read+0x92/0x150 ksys_read+0xe2/0x188 system_call+0xd8/0x2b4 Fix it by using cgroup_ino(). Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: Qian Cai Acked-by: Johannes Weiner Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v5.5 --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c719a4154d6d..7a39dc882095 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } From 2e5383d7904e60529136727e49629a82058a5607 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 19 Feb 2020 12:01:29 -0700 Subject: [PATCH 5/5] cgroup1: don't call release_agent when it is "" Older (and maybe current) versions of systemd set release_agent to "" when shutting down, but do not set notify_on_release to 0. Since 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()"), we filter out such calls when the user mode helper path is "". However, when used in conjunction with an actual (i.e. non "") STATIC_USERMODEHELPER, the path is never "", so the real usermode helper will be called with argv[0] == "". Let's avoid this by not invoking the release_agent when it is "". Signed-off-by: Tycho Andersen Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 9a6f060cbf51..f2d7cea86ffe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -783,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work) pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) + if (!pathbuf || !agentbuf || !strlen(agentbuf)) goto out; spin_lock_irq(&css_set_lock);