From 2896b0f09f2617f953b8038978106cc4cbb4c52b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 May 2017 13:31:23 -0500 Subject: [PATCH 01/22] pids: Initialize leader_pid in init_task This is cheap and no cost so we might as well. Signed-off-by: "Eric W. Biederman" --- init/init_task.c | 1 + 1 file changed, 1 insertion(+) diff --git a/init/init_task.c b/init/init_task.c index 74f60baa2799..7914ffb8dc73 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -33,6 +33,7 @@ static struct signal_struct init_signals = { }, #endif INIT_CPU_TIMERS(init_signals) + .leader_pid = &init_struct_pid, INIT_PREV_CPUTIME(init_signals) }; From 1fb53567a3633740aac8761eb7023dc5671f0edb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 May 2017 13:45:14 -0500 Subject: [PATCH 02/22] pids: Move task_pid_type into sched/signal.h The function is general and inline so there is no need to hide it inside of exit.c Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 8 ++++++++ kernel/exit.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 113d1ad1ced7..d8ef0a3d2e7e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -556,6 +556,14 @@ extern bool current_is_single_threaded(void); typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/kernel/exit.c b/kernel/exit.c index c3c7ac560114..16432428fc6c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1001,14 +1001,6 @@ struct wait_opts { int notask_error; }; -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || From 7a36094d61bfe9843de5484ff0140227983ac5d5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2017 12:45:33 -0500 Subject: [PATCH 03/22] pids: Compute task_tgid using signal->leader_pid The cost is the the same and this removes the need to worry about complications that come from de_thread and group_leader changing. __task_pid_nr_ns has been updated to take advantage of this change. Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/fsys.S | 8 ++++---- drivers/platform/x86/thinkpad_acpi.c | 1 + fs/fuse/file.c | 1 + fs/notify/fanotify/fanotify.c | 1 + include/linux/sched.h | 5 ----- include/linux/sched/signal.h | 5 +++++ include/net/scm.h | 1 + kernel/pid.c | 15 ++++++++------- 9 files changed, 22 insertions(+), 17 deletions(-) diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f4db2168d1b8..f5433bb7f04a 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -50,7 +50,6 @@ void foo(void) DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); - DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); @@ -68,6 +67,7 @@ void foo(void) DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, group_stop_count)); DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); + DEFINE(IA64_SIGNAL_LEADER_PID_OFFSET, offsetof (struct signal_struct, leader_pid)); BLANK(); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index fe742ffafc7a..eaf5a0d6f3e0 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -62,16 +62,16 @@ ENTRY(fsys_getpid) .prologue .altrp b6 .body - add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 + add r17=IA64_TASK_SIGNAL_OFFSET,r16 ;; - ld8 r17=[r17] // r17 = current->group_leader + ld8 r17=[r17] // r17 = current->signal add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; ld4 r9=[r9] - add r17=IA64_TASK_TGIDLINK_OFFSET,r17 + add r17=IA64_SIGNAL_LEADER_PID_OFFSET,r17 ;; and r9=TIF_ALLWORK_MASK,r9 - ld8 r17=[r17] // r17 = current->group_leader->pids[PIDTYPE_PID].pid + ld8 r17=[r17] // r17 = current->signal->leader_pid ;; add r8=IA64_PID_LEVEL_OFFSET,r17 ;; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index cae9b0595692..d556e95c532c 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a201fb0ac64f..b00a3f126a89 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f90842efea13..6e828cb82e5e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 87bf02d93a27..a461ff89a3af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,11 +1202,6 @@ static inline struct pid *task_pid(struct task_struct *task) return task->pids[PIDTYPE_PID].pid; } -static inline struct pid *task_tgid(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PID].pid; -} - /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d8ef0a3d2e7e..b95a272c1ab5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -564,6 +564,11 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type) return task->pids[type].pid; } +static inline struct pid *task_tgid(struct task_struct *task) +{ + return task->signal->leader_pid; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/include/net/scm.h b/include/net/scm.h index 903771c8d4e3..1ce365f4c256 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* Well, we should have at least one descriptor open * to accept passed FDs 8) diff --git a/kernel/pid.c b/kernel/pid.c index 157fe4b19971..d0de2b59f86f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -421,13 +421,14 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) { - if (type == __PIDTYPE_TGID) - type = PIDTYPE_PID; - - task = task->group_leader; - } - nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else if (type == __PIDTYPE_TGID) + pid = task_tgid(task); + else + pid = rcu_dereference(task->group_leader->pids[type].pid); + nr = pid_nr_ns(pid, ns); } rcu_read_unlock(); From 71dbc8a96953aa91c50181a526acb7f80af74f67 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 16 Jul 2017 21:39:32 -0500 Subject: [PATCH 04/22] kvm: Don't open code task_pid in kvm_vcpu_ioctl Signed-off-by: "Eric W. Biederman" --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ada21f47f22b..4c593acc4510 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2560,7 +2560,7 @@ static long kvm_vcpu_ioctl(struct file *filp, if (arg) goto out; oldpid = rcu_access_pointer(vcpu->pid); - if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { + if (unlikely(oldpid != task_pid(current))) { /* The thread running this VCPU changed. */ struct pid *newpid; From 2c4704756cab7cfa031ada4dab361562f0e357c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2017 13:06:43 -0500 Subject: [PATCH 05/22] pids: Move the pgrp and session pid pointers from task_struct to signal_struct To access these fields the code always has to go to group leader so going to signal struct is no loss and is actually a fundamental simplification. This saves a little bit of memory by only allocating the pid pointer array once instead of once for every thread, and even better this removes a few potential races caused by the fact that group_leader can be changed by de_thread, while signal_struct can not. Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/fsys.S | 4 +-- fs/autofs/autofs_i.h | 1 + include/linux/init_task.h | 9 ------- include/linux/pid.h | 8 +----- include/linux/sched.h | 22 +++-------------- include/linux/sched/signal.h | 26 +++++++++++++++++--- init/init_task.c | 11 +++++---- kernel/fork.c | 23 +++++++++++++---- kernel/pid.c | 45 +++++++++++++++++----------------- 10 files changed, 78 insertions(+), 73 deletions(-) diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f5433bb7f04a..c1f8a57855af 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -50,7 +50,7 @@ void foo(void) DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); - DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); + DEFINE(IA64_TASK_THREAD_PID_OFFSET,offsetof (struct task_struct, thread_pid)); DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index eaf5a0d6f3e0..e85ebdac678b 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -96,11 +96,11 @@ ENTRY(fsys_set_tid_address) .altrp b6 .body add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - add r17=IA64_TASK_TGIDLINK_OFFSET,r16 + add r17=IA64_TASK_THREAD_PID_OFFSET,r16 ;; ld4 r9=[r9] tnat.z p6,p7=r32 // check argument register for being NaT - ld8 r17=[r17] // r17 = current->pids[PIDTYPE_PID].pid + ld8 r17=[r17] // r17 = current->thread_pid ;; and r9=TIF_ALLWORK_MASK,r9 add r8=IA64_PID_LEVEL_OFFSET,r17 diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 9400a9f6318a..502812289850 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a454b8aeb938..a7083a45a26c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -46,15 +46,6 @@ extern struct cred init_cred; #define INIT_CPU_TIMERS(s) #endif -#define INIT_PID_LINK(type) \ -{ \ - .node = { \ - .next = NULL, \ - .pprev = NULL, \ - }, \ - .pid = &init_struct_pid, \ -} - #define INIT_TASK_COMM "swapper" /* Attach to the init_task data structure for proper alignment */ diff --git a/include/linux/pid.h b/include/linux/pid.h index 7633d55d9a24..3d4c504dcc8c 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -67,12 +67,6 @@ struct pid extern struct pid init_struct_pid; -struct pid_link -{ - struct hlist_node node; - struct pid *pid; -}; - static inline struct pid *get_pid(struct pid *pid) { if (pid) @@ -177,7 +171,7 @@ pid_t pid_vnr(struct pid *pid); do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ - &(pid)->tasks[type], pids[type].node) { + &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to diff --git a/include/linux/sched.h b/include/linux/sched.h index a461ff89a3af..445bdf5b1f64 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,7 +775,8 @@ struct task_struct { struct list_head ptrace_entry; /* PID/PID hash table linkage. */ - struct pid_link pids[PIDTYPE_MAX]; + struct pid *thread_pid; + struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; @@ -1199,22 +1200,7 @@ struct task_struct { static inline struct pid *task_pid(struct task_struct *task) { - return task->pids[PIDTYPE_PID].pid; -} - -/* - * Without tasklist or RCU lock it is not safe to dereference - * the result of task_pgrp/task_session even if task == current, - * we can race with another thread doing sys_setsid/sys_setpgid. - */ -static inline struct pid *task_pgrp(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PGID].pid; -} - -static inline struct pid *task_session(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_SID].pid; + return task->thread_pid; } /* @@ -1263,7 +1249,7 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) */ static inline int pid_alive(const struct task_struct *p) { - return p->pids[PIDTYPE_PID].pid != NULL; + return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b95a272c1ab5..2dcded16eb1e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -146,7 +146,9 @@ struct signal_struct { #endif + /* PID/PID hash table linkage. */ struct pid *leader_pid; + struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; @@ -559,9 +561,12 @@ void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else + pid = task->signal->pids[type]; + return pid; } static inline struct pid *task_tgid(struct task_struct *task) @@ -569,6 +574,21 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->signal->leader_pid; } +/* + * Without tasklist or RCU lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ +static inline struct pid *task_pgrp(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_PGID]; +} + +static inline struct pid *task_session(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_SID]; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/init/init_task.c b/init/init_task.c index 7914ffb8dc73..db12a61259f1 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -34,6 +34,11 @@ static struct signal_struct init_signals = { #endif INIT_CPU_TIMERS(init_signals) .leader_pid = &init_struct_pid, + .pids = { + [PIDTYPE_PID] = &init_struct_pid, + [PIDTYPE_PGID] = &init_struct_pid, + [PIDTYPE_SID] = &init_struct_pid, + }, INIT_PREV_CPUTIME(init_signals) }; @@ -112,11 +117,7 @@ struct task_struct init_task INIT_CPU_TIMERS(init_task) .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), .timer_slack_ns = 50000, /* 50 usec default slack */ - .pids = { - [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), - [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), - [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), - }, + .thread_pid = &init_struct_pid, .thread_group = LIST_HEAD_INIT(init_task.thread_group), .thread_node = LIST_HEAD_INIT(init_signals.thread_head), #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..d2952162399b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1549,10 +1549,22 @@ static void posix_cpu_timers_init(struct task_struct *tsk) static inline void posix_cpu_timers_init(struct task_struct *tsk) { } #endif +static inline void init_task_pid_links(struct task_struct *task) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&task->pid_links[type]); + } +} + static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { - task->pids[type].pid = pid; + if (type == PIDTYPE_PID) + task->thread_pid = pid; + else + task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) @@ -1928,6 +1940,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2036,13 +2049,13 @@ fork_out: return ERR_PTR(retval); } -static inline void init_idle_pids(struct pid_link *links) +static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { - INIT_HLIST_NODE(&links[type].node); /* not really needed */ - links[type].pid = &init_struct_pid; + INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ + init_task_pid(idle, type, &init_struct_pid); } } @@ -2052,7 +2065,7 @@ struct task_struct *fork_idle(int cpu) task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { - init_idle_pids(task->pids); + init_idle_pids(task); init_idle(task, cpu); } diff --git a/kernel/pid.c b/kernel/pid.c index d0de2b59f86f..f8486d2e2346 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -265,27 +265,35 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); +static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) +{ + return (type == PIDTYPE_PID) ? + &task->thread_pid : + (type == __PIDTYPE_TGID) ? + &task->signal->leader_pid : + &task->signal->pids[type]; +} + /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid_link *link = &task->pids[type]; - hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); + struct pid *pid = *task_pid_ptr(task, type); + hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { - struct pid_link *link; + struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; - link = &task->pids[type]; - pid = link->pid; + pid = *pid_ptr; - hlist_del_rcu(&link->node); - link->pid = new; + hlist_del_rcu(&task->pid_links[type]); + *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (!hlist_empty(&pid->tasks[tmp])) @@ -310,8 +318,9 @@ void change_pid(struct task_struct *task, enum pid_type type, void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - new->pids[type].pid = old->pids[type].pid; - hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + if (type == PIDTYPE_PID) + new->thread_pid = old->thread_pid; + hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) @@ -322,7 +331,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) - result = hlist_entry(first, struct task_struct, pids[(type)].node); + result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } @@ -360,9 +369,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); - if (type != PIDTYPE_PID) - task = task->group_leader; - pid = get_pid(rcu_dereference(task->pids[type].pid)); + pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } @@ -420,16 +427,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) { - struct pid *pid; - if (type == PIDTYPE_PID) - pid = task_pid(task); - else if (type == __PIDTYPE_TGID) - pid = task_tgid(task); - else - pid = rcu_dereference(task->group_leader->pids[type].pid); - nr = pid_nr_ns(pid, ns); - } + if (likely(pid_alive(task))) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; From 6883f81aac6f44e7df70a6af189b3689ff52cbfb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 4 Jun 2017 04:32:13 -0500 Subject: [PATCH 06/22] pid: Implement PIDTYPE_TGID Everywhere except in the pid array we distinguish between a tasks pid and a tasks tgid (thread group id). Even in the enumeration we want that distinction sometimes so we have added __PIDTYPE_TGID. With leader_pid we almost have an implementation of PIDTYPE_TGID in struct signal_struct. Add PIDTYPE_TGID as a first class member of the pid_type enumeration and into the pids array. Then remove the __PIDTYPE_TGID special case and the leader_pid in signal_struct. The net size increase is just an extra pointer added to struct pid and an extra pair of pointers of an hlist_node added to task_struct. The effect on code maintenance is the removal of a number of special cases today and the potential to remove many more special cases as PIDTYPE_TGID gets used to it's fullest. The long term potential is allowing zombie thread group leaders to exit, which will remove a lot more special cases in the code. Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/fsys.S | 4 ++-- arch/s390/kernel/perf_cpum_sf.c | 2 +- fs/exec.c | 1 + include/linux/pid.h | 3 +-- include/linux/sched.h | 4 ++-- include/linux/sched/signal.h | 5 ++--- init/init_task.c | 2 +- kernel/events/core.c | 2 +- kernel/exit.c | 1 + kernel/fork.c | 3 ++- kernel/pid.c | 2 -- kernel/time/itimer.c | 5 +++-- kernel/time/posix-cpu-timers.c | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index c1f8a57855af..00e8e2a1eb19 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -67,7 +67,7 @@ void foo(void) DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, group_stop_count)); DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); - DEFINE(IA64_SIGNAL_LEADER_PID_OFFSET, offsetof (struct signal_struct, leader_pid)); + DEFINE(IA64_SIGNAL_PIDS_TGID_OFFSET, offsetof (struct signal_struct, pids[PIDTYPE_TGID])); BLANK(); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index e85ebdac678b..d80c99a5f55d 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -68,10 +68,10 @@ ENTRY(fsys_getpid) add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; ld4 r9=[r9] - add r17=IA64_SIGNAL_LEADER_PID_OFFSET,r17 + add r17=IA64_SIGNAL_PIDS_TGID_OFFSET,r17 ;; and r9=TIF_ALLWORK_MASK,r9 - ld8 r17=[r17] // r17 = current->signal->leader_pid + ld8 r17=[r17] // r17 = current->signal->pids[PIDTYPE_TGID] ;; add r8=IA64_PID_LEVEL_OFFSET,r17 ;; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 0292d68e7dde..ca0b7ae894bb 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -665,7 +665,7 @@ static void cpumsf_output_event_pid(struct perf_event *event, goto out; /* Update the process ID (see also kernel/events/core.c) */ - data->tid_entry.pid = cpumsf_pid_type(event, pid, __PIDTYPE_TGID); + data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID); data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID); perf_output_sample(&handle, &header, data, event); diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..79a11fbded7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1146,6 +1146,7 @@ static int de_thread(struct task_struct *tsk) */ tsk->pid = leader->pid; change_pid(tsk, PIDTYPE_PID, task_pid(leader)); + transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); diff --git a/include/linux/pid.h b/include/linux/pid.h index 3d4c504dcc8c..14a9a39da9c7 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -7,11 +7,10 @@ enum pid_type { PIDTYPE_PID, + PIDTYPE_TGID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX, - /* only valid to __task_pid_nr_ns() */ - __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 445bdf5b1f64..06b4e3bda93a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1275,12 +1275,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { - return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { - return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 2dcded16eb1e..ee30a5ba475f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -147,7 +147,6 @@ struct signal_struct { #endif /* PID/PID hash table linkage. */ - struct pid *leader_pid; struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL @@ -571,7 +570,7 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type) static inline struct pid *task_tgid(struct task_struct *task) { - return task->signal->leader_pid; + return task->signal->pids[PIDTYPE_TGID]; } /* @@ -607,7 +606,7 @@ static inline bool thread_group_leader(struct task_struct *p) */ static inline bool has_group_leader_pid(struct task_struct *p) { - return task_pid(p) == p->signal->leader_pid; + return task_pid(p) == task_tgid(p); } static inline diff --git a/init/init_task.c b/init/init_task.c index db12a61259f1..4f97846256d7 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -33,9 +33,9 @@ static struct signal_struct init_signals = { }, #endif INIT_CPU_TIMERS(init_signals) - .leader_pid = &init_struct_pid, .pids = { [PIDTYPE_PID] = &init_struct_pid, + [PIDTYPE_TGID] = &init_struct_pid, [PIDTYPE_PGID] = &init_struct_pid, [PIDTYPE_SID] = &init_struct_pid, }, diff --git a/kernel/events/core.c b/kernel/events/core.c index 80cca2b30c4f..9025b1796ca8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1334,7 +1334,7 @@ static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - return perf_event_pid_type(event, p, __PIDTYPE_TGID); + return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) diff --git a/kernel/exit.c b/kernel/exit.c index 16432428fc6c..25582b442955 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -73,6 +73,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { + detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); diff --git a/kernel/fork.c b/kernel/fork.c index d2952162399b..cc5be0d01ce6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1946,6 +1946,7 @@ static __latent_entropy struct task_struct *copy_process( init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { + init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); @@ -1954,7 +1955,6 @@ static __latent_entropy struct task_struct *copy_process( p->signal->flags |= SIGNAL_UNKILLABLE; } - p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same @@ -1965,6 +1965,7 @@ static __latent_entropy struct task_struct *copy_process( p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); diff --git a/kernel/pid.c b/kernel/pid.c index f8486d2e2346..de1cfc4f75a2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -269,8 +269,6 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : - (type == __PIDTYPE_TGID) ? - &task->signal->leader_pid : &task->signal->pids[type]; } diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index f26acef5d7b4..9a65713c8309 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -139,9 +139,10 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; - trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + trace_itimer_expire(ITIMER_REAL, leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); return HRTIMER_NORESTART; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5a6251ac6f7a..40e6fae46cec 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -895,7 +895,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, - tsk->signal->leader_pid, cur_time); + task_tgid(tsk), cur_time); __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } From 019191342fecce4a461978a7191a43f313e19e86 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 16 Jul 2017 22:05:57 -0500 Subject: [PATCH 07/22] signal: Use PIDTYPE_TGID to clearly store where file signals will be sent When f_setown is called a pid and a pid type are stored. Replace the use of PIDTYPE_PID with PIDTYPE_TGID as PIDTYPE_TGID goes to the entire thread group. Replace the use of PIDTYPE_MAX with PIDTYPE_PID as PIDTYPE_PID now is only for a thread. Update the users of __f_setown to use PIDTYPE_TGID instead of PIDTYPE_PID. For now the code continues to capture task_pid (when task_tgid would really be appropriate), and iterate on PIDTYPE_PID (even when type == PIDTYPE_TGID) out of an abundance of caution to preserve existing behavior. Oleg Nesterov suggested using the test to ensure we use PIDTYPE_PID for tgid lookup also be used to avoid taking the tasklist lock. Suggested-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- drivers/net/tun.c | 2 +- drivers/tty/tty_io.c | 2 +- fs/fcntl.c | 54 ++++++++++++++++++++++--------------- fs/locks.c | 2 +- fs/notify/dnotify/dnotify.c | 3 ++- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a192a017cc68..9958b70ac1b0 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3216,7 +3216,7 @@ static int tun_chr_fasync(int fd, struct file *file, int on) goto out; if (on) { - __f_setown(file, task_pid(current), PIDTYPE_PID, 0); + __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index aba59521ad48..090fb7e78eea 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2122,7 +2122,7 @@ static int __tty_fasync(int fd, struct file *filp, int on) type = PIDTYPE_PGID; } else { pid = task_pid(current); - type = PIDTYPE_PID; + type = PIDTYPE_TGID; } get_pid(pid); spin_unlock_irqrestore(&tty->ctrl_lock, flags); diff --git a/fs/fcntl.c b/fs/fcntl.c index 12273b6ea56d..1523588fd759 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -116,7 +116,7 @@ int f_setown(struct file *filp, unsigned long arg, int force) struct pid *pid = NULL; int who = arg, ret = 0; - type = PIDTYPE_PID; + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ if (who == INT_MIN) @@ -143,7 +143,7 @@ EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, NULL, PIDTYPE_PID, 1); + f_modown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) @@ -171,11 +171,11 @@ static int f_setown_ex(struct file *filp, unsigned long arg) switch (owner.type) { case F_OWNER_TID: - type = PIDTYPE_MAX; + type = PIDTYPE_PID; break; case F_OWNER_PID: - type = PIDTYPE_PID; + type = PIDTYPE_TGID; break; case F_OWNER_PGRP: @@ -206,11 +206,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg) read_lock(&filp->f_owner.lock); owner.pid = pid_vnr(filp->f_owner.pid); switch (filp->f_owner.pid_type) { - case PIDTYPE_MAX: + case PIDTYPE_PID: owner.type = F_OWNER_TID; break; - case PIDTYPE_PID: + case PIDTYPE_TGID: owner.type = F_OWNER_PID; break; @@ -785,20 +785,25 @@ void send_sigio(struct fown_struct *fown, int fd, int band) read_lock(&fown->lock); type = fown->pid_type; - if (type == PIDTYPE_MAX) { + if (type == PIDTYPE_PID) group = 0; - type = PIDTYPE_PID; - } pid = fown->pid; if (!pid) goto out_unlock_fown; - - read_lock(&tasklist_lock); - do_each_pid_task(pid, type, p) { + + if (type <= PIDTYPE_TGID) { + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); send_sigio_to_task(p, fown, fd, band, group); - } while_each_pid_task(pid, type, p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); + } else { + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigio_to_task(p, fown, fd, band, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + } out_unlock_fown: read_unlock(&fown->lock); } @@ -821,22 +826,27 @@ int send_sigurg(struct fown_struct *fown) read_lock(&fown->lock); type = fown->pid_type; - if (type == PIDTYPE_MAX) { + if (type == PIDTYPE_PID) group = 0; - type = PIDTYPE_PID; - } pid = fown->pid; if (!pid) goto out_unlock_fown; ret = 1; - - read_lock(&tasklist_lock); - do_each_pid_task(pid, type, p) { + + if (type <= PIDTYPE_TGID) { + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); send_sigurg_to_task(p, fown, group); - } while_each_pid_task(pid, type, p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); + } else { + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigurg_to_task(p, fown, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + } out_unlock_fown: read_unlock(&fown->lock); return ret; diff --git a/fs/locks.c b/fs/locks.c index db7b6917d9c5..cfc059bda8ea 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -546,7 +546,7 @@ lease_setup(struct file_lock *fl, void **priv) if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa)) *priv = NULL; - __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } static const struct lock_manager_operations lease_manager_ops = { diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e2bea2ac5dfb..484f2c3a33bb 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -353,7 +354,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out; } - __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ From 2118e1f53f6f0973a1d9a6a7dc9296959bf39ec0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 00:00:29 -0500 Subject: [PATCH 08/22] posix-timers: Noralize good_sigevent In good_sigevent directly compute the default return value as "task_tgid(current)". This is exactly the same as "task_pid(current->group_leader)" but written more clearly. In the thread case first compute the thread's pid. Then veify that attached to that pid is a thread of the current thread group. This has the net effect of making the code a little clearer, and making it obvious that posix timers never look up a process by a the pid of a thread. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-timers.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e08ce3f27447..2bdf08a2bae9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -433,11 +433,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { - struct task_struct *rtn = current->group_leader; + struct pid *pid = task_tgid(current); + struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: - rtn = find_task_by_vpid(event->sigev_notify_thread_id); + pid = find_vpid(event->sigev_notify_thread_id); + rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; /* FALLTHRU */ @@ -447,7 +449,7 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; /* FALLTHRU */ case SIGEV_NONE: - return task_pid(rtn); + return pid; default: return NULL; } From 24122c7f4969adeeaeca3fb1656a31569e9aa59b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 14:30:23 -0500 Subject: [PATCH 09/22] signal: Pass pid and pid type into send_sigqueue Make the code more maintainable by performing more of the signal related work in send_sigqueue. A quick inspection of do_timer_create will show that this code path does not lookup a thread group by a thread's pid. Making it safe to find the task pointed to by it_pid with "pid_task(it_pid, type)"; This supports the changes needed in fork to tell if a signal was sent to a single process or a group of processes. Having the pid to task transition in signal.c will also make it easier to sort out races with de_thread and and the thread group leader exiting when it comes time to address that. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 +- kernel/signal.c | 14 +++++++++----- kernel/time/posix-timers.c | 13 ++++--------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ee30a5ba475f..94558ffa82ab 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -330,7 +330,7 @@ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); +extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline int restart_syscall(void) diff --git a/kernel/signal.c b/kernel/signal.c index 8d8a940422a8..40feb14e276d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1664,17 +1664,20 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) +int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) { int sig = q->info.si_signo; struct sigpending *pending; + struct task_struct *t; unsigned long flags; int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); ret = -1; - if (!likely(lock_task_sighand(t, &flags))) + rcu_read_lock(); + t = pid_task(pid, type); + if (!t || !likely(lock_task_sighand(t, &flags))) goto ret; ret = 1; /* the signal is ignored */ @@ -1696,15 +1699,16 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun = 0; signalfd_notify(t, sig); - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + complete_signal(sig, t, type != PIDTYPE_PID); result = TRACE_SIGNAL_DELIVERED; out: - trace_signal_generate(sig, &q->info, t, group, result); + trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); ret: + rcu_read_unlock(); return ret; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2bdf08a2bae9..2d2e739fbc57 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -332,8 +332,8 @@ void posixtimer_rearm(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { - struct task_struct *task; - int shared, ret = -1; + enum pid_type type; + int ret = -1; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). @@ -347,13 +347,8 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - rcu_read_lock(); - task = pid_task(timr->it_pid, PIDTYPE_PID); - if (task) { - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, task, shared); - } - rcu_read_unlock(); + type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; + ret = send_sigqueue(timr->sigq, timr->it_pid, type); /* If we failed to send the signal the timer stops. */ return ret > 0; } From 0102498083d58d8b17759642c602b525215e1a54 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 18:40:57 -0500 Subject: [PATCH 10/22] signal: Pass pid type into group_send_sig_info This passes the information we already have at the call sight into group_send_sig_info. Ultimatelly allowing for to better handle signals sent to a group of processes. Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 4 +++- kernel/exit.c | 3 ++- kernel/signal.c | 10 ++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/linux/signal.h b/include/linux/signal.h index 3c5200137b24..d8f2bf3d41e6 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,11 +254,13 @@ static inline int valid_signal(unsigned long sig) struct timespec; struct pt_regs; +enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, bool group); -extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); +extern int group_send_sig_info(int sig, struct siginfo *info, + struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); diff --git a/kernel/exit.c b/kernel/exit.c index 25582b442955..0e21e6d21f35 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -681,7 +681,8 @@ static void forget_original_parent(struct task_struct *father, t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, - SEND_SIG_NOINFO, t); + SEND_SIG_NOINFO, t, + PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to diff --git a/kernel/signal.c b/kernel/signal.c index 40feb14e276d..c7527338fe9d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1274,7 +1274,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, /* * send signal info to all the members of a group */ -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + enum pid_type type) { int ret; @@ -1301,7 +1302,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) success = 0; retval = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); success |= !err; retval = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); @@ -1317,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) - error = group_send_sig_info(sig, info, p); + error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; @@ -1420,7 +1421,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, + PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; From 9c2db007787ef1aac6728c5e03d37b0ae935d122 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 08:17:29 -0500 Subject: [PATCH 11/22] signal: Pass pid type into send_sigio_to_task & send_sigurg_to_task This information is already present and using it directly simplifies the logic of the code. Signed-off-by: "Eric W. Biederman" --- fs/fcntl.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 1523588fd759..5d596a00f40b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -723,7 +723,7 @@ static inline int sigio_perm(struct task_struct *p, static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, - int fd, int reason, int group) + int fd, int reason, enum pid_type type) { /* * F_SETSIG can change ->signum lockless in parallel, make @@ -767,11 +767,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; - if (!do_send_sig_info(signum, &si, p, group)) + if (!do_send_sig_info(signum, &si, p, type != PIDTYPE_PID)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group); + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type != PIDTYPE_PID); } } @@ -780,14 +780,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band) struct task_struct *p; enum pid_type type; struct pid *pid; - int group = 1; read_lock(&fown->lock); type = fown->pid_type; - if (type == PIDTYPE_PID) - group = 0; - pid = fown->pid; if (!pid) goto out_unlock_fown; @@ -795,12 +791,12 @@ void send_sigio(struct fown_struct *fown, int fd, int band) if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); - send_sigio_to_task(p, fown, fd, band, group); + send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { - send_sigio_to_task(p, fown, fd, band, group); + send_sigio_to_task(p, fown, fd, band, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } @@ -809,10 +805,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band) } static void send_sigurg_to_task(struct task_struct *p, - struct fown_struct *fown, int group) + struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) - do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group); + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type != PIDTYPE_PID); } int send_sigurg(struct fown_struct *fown) @@ -820,15 +816,11 @@ int send_sigurg(struct fown_struct *fown) struct task_struct *p; enum pid_type type; struct pid *pid; - int group = 1; int ret = 0; read_lock(&fown->lock); type = fown->pid_type; - if (type == PIDTYPE_PID) - group = 0; - pid = fown->pid; if (!pid) goto out_unlock_fown; @@ -838,12 +830,12 @@ int send_sigurg(struct fown_struct *fown) if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); - send_sigurg_to_task(p, fown, group); + send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { - send_sigurg_to_task(p, fown, group); + send_sigurg_to_task(p, fown, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } From 40b3b02535621027f56d248139e0e467573c3098 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 10:45:15 -0500 Subject: [PATCH 12/22] signal: Pass pid type into do_send_sig_info This passes the information we already have at the call sight into do_send_sig_info. Ultimately allowing for better handling of signals sent to a group of processes during fork. Signed-off-by: "Eric W. Biederman" --- drivers/tty/sysrq.c | 2 +- fs/fcntl.c | 6 +++--- include/linux/signal.h | 2 +- kernel/signal.c | 10 +++++----- mm/oom_kill.c | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 6364890575ec..06ed20dd01ba 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -348,7 +348,7 @@ static void send_sig_all(int sig) if (is_global_init(p)) continue; - do_send_sig_info(sig, SEND_SIG_FORCED, p, true); + do_send_sig_info(sig, SEND_SIG_FORCED, p, PIDTYPE_MAX); } read_unlock(&tasklist_lock); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 5d596a00f40b..a04accf6847f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -767,11 +767,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; - if (!do_send_sig_info(signum, &si, p, type != PIDTYPE_PID)) + if (!do_send_sig_info(signum, &si, p, type)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type != PIDTYPE_PID); + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } @@ -808,7 +808,7 @@ static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) - do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type != PIDTYPE_PID); + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct fown_struct *fown) diff --git a/include/linux/signal.h b/include/linux/signal.h index d8f2bf3d41e6..fe125b0335f7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -258,7 +258,7 @@ enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, - struct task_struct *p, bool group); + struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index c7527338fe9d..2c09e6143dd8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1161,13 +1161,13 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) } int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - bool group) + enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, group); + ret = send_signal(sig, info, p, type != PIDTYPE_PID); unlock_task_sighand(p, &flags); } @@ -1284,7 +1284,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, rcu_read_unlock(); if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); + ret = do_send_sig_info(sig, info, p, type); return ret; } @@ -1448,7 +1448,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - return do_send_sig_info(sig, info, p, false); + return do_send_sig_info(sig, info, p, PIDTYPE_PID); } #define __si_special(priv) \ @@ -3199,7 +3199,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) * probe. No signal is actually delivered. */ if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); + error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 84081e77bc51..2cc9b238368f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -920,7 +920,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * in order to prevent the OOM victim from depleting the memory * reserves from the user space under its control. */ - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -958,7 +958,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) */ if (unlikely(p->flags & PF_KTHREAD)) continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID); } rcu_read_unlock(); From b213984bd3987e6f5bf96b7847d151b7b42e1efd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 15:49:17 -0500 Subject: [PATCH 13/22] signal: Push pid type down into send_signal This information is already available in the callers and by pushing it down it makes the code a little clearer, and allows better group signal behavior in fork. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 2c09e6143dd8..8decc70c1dc2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1103,7 +1103,7 @@ ret: } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) + enum pid_type type) { int from_ancestor_ns = 0; @@ -1112,7 +1112,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif - return __send_signal(sig, info, t, group, from_ancestor_ns); + return __send_signal(sig, info, t, type != PIDTYPE_PID, from_ancestor_ns); } static void print_fatal_signal(int signr) @@ -1151,13 +1151,13 @@ __setup("print-fatal-signals=", setup_print_fatal_signals); int __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - return send_signal(sig, info, p, 1); + return send_signal(sig, info, p, PIDTYPE_TGID); } static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) { - return send_signal(sig, info, t, 0); + return send_signal(sig, info, t, PIDTYPE_PID); } int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, @@ -1167,7 +1167,7 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, type != PIDTYPE_PID); + ret = send_signal(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -3966,7 +3966,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, false); + ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", From 5a883cee7442a5fd72e18eae895113fbd7b16110 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 19:26:27 -0500 Subject: [PATCH 14/22] signal: Push pid type down into __send_signal This information is already available in the callers and by pushing it down it makes the code a little clearer, and allows implementing better handling of signales set to a group of processes in fork. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 8decc70c1dc2..1ef94303d87a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -998,7 +998,7 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str #endif static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group, int from_ancestor_ns) + enum pid_type type, int from_ancestor_ns) { struct sigpending *pending; struct sigqueue *q; @@ -1012,7 +1012,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more @@ -1096,9 +1096,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + complete_signal(sig, t, type != PIDTYPE_PID); ret: - trace_signal_generate(sig, info, t, group, result); + trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } @@ -1112,7 +1112,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif - return __send_signal(sig, info, t, type != PIDTYPE_PID, from_ancestor_ns); + return __send_signal(sig, info, t, type, from_ancestor_ns); } static void print_fatal_signal(int signr) @@ -1377,7 +1377,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, 1, 0); + ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); unlock_task_sighand(p, &flags); } else ret = -ESRCH; From 0729614992c946f6e8ccb9ef260aea1f06993df0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 21:39:13 -0500 Subject: [PATCH 15/22] signal: Push pid type down into complete_signal. This is the bottom and by pushing this down it simplifies the callers and otherwise leaves things as is. This is in preparation for allowing fork to implement better handling of signals set to groups of processes. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 1ef94303d87a..dddbea558455 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -895,7 +895,7 @@ static inline int wants_signal(int sig, struct task_struct *p) return task_curr(p) || !signal_pending(p); } -static void complete_signal(int sig, struct task_struct *p, int group) +static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -908,7 +908,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) */ if (wants_signal(sig, p)) t = p; - else if (!group || thread_group_empty(p)) + else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. @@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); - complete_signal(sig, t, type != PIDTYPE_PID); + complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; @@ -1704,7 +1704,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); - complete_signal(sig, t, type != PIDTYPE_PID); + complete_signal(sig, t, type); result = TRACE_SIGNAL_DELIVERED; out: trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); From 4ca1d3ee46130e9b939c02a93e3970dad151fed6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 15:30:33 -0500 Subject: [PATCH 16/22] fork: Move and describe why the code examines PIDNS_ADDING Normally this would be something that would be handled by handling signals that are sent to a group of processes but in this case the forking process is not a member of the group being signaled. Thus special code is needed to prevent a race with pid namespaces exiting, and fork adding new processes within them. Move this test up before the signal restart just in case signals are also pending. Fatal conditions should take presedence over restarts. Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index cc5be0d01ce6..b9c54318a292 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1922,6 +1922,12 @@ static __latent_entropy struct task_struct *copy_process( rseq_fork(p, clone_flags); + /* Don't start children in a dying pid namespace */ + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the @@ -1935,10 +1941,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; - goto bad_fork_cancel_cgroup; - } + init_task_pid_links(p); if (likely(p->pid)) { From 7673bf553b2732a00f7644fb2adadda69389ab37 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 08:01:10 -0500 Subject: [PATCH 17/22] fork: Unconditionally exit if a fatal signal is pending In practice this does not change anything as testing for fatal_signal_pending and exiting for with an error code duplicates the work of the next clause which recalculates pending signals and then exits fork if any are pending. In both cases the pending signal will trigger the slow path when existing to userspace, and the fatal signal will cause do_exit to be called. The advantage of making this a separate test is that it makes it clear processing the fatal signal will terminate the fork, and it allows the rest of the signal logic to be updated without fear that this important case will be lost. Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index b9c54318a292..22d4cdb9a7ca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1928,6 +1928,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* Let kill terminate clone/fork in the middle */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto bad_fork_cancel_cgroup; + } + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the From 088fe47ce952542389e604e83f533811750aaf7c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 17:26:49 -0500 Subject: [PATCH 18/22] signal: Add calculate_sigpending() Add a function calculate_sigpending to test to see if any signals are pending for a new task immediately following fork. Signals have to happen either before or after fork. Today our practice is to push all of the signals to before the fork, but that has the downside that frequent or periodic signals can make fork take much much longer than normal or prevent fork from completing entirely. So we need move signals that we can after the fork to prevent that. This updates the code to set TIF_SIGPENDING on a new task if there are signals or other activities that have moved so that they appear to happen after the fork. As the code today restarts if it sees any such activity this won't immediately have an effect, as there will be no reason for it to set TIF_SIGPENDING immediately after the fork. Adding calculate_sigpending means the code in fork can safely be changed to not always restart if a signal is pending. The new calculate_sigpending function sets sigpending if there are pending bits in jobctl, pending signals, the freezer needs to freeze the new task or the live kernel patching framework need the new thread to take the slow path to userspace. I have verified that setting TIF_SIGPENDING does make a new process take the slow path to userspace before it executes it's first userspace instruction. I have looked at the callers of signal_wake_up and the code paths setting TIF_SIGPENDING and I don't see anything else that needs to be handled. The code probably doesn't need to set TIF_SIGPENDING for the kernel live patching as it uses a separate thread flag as well. But at this point it seems safer reuse the recalc_sigpending logic and get the kernel live patching folks to sort out their story later. V2: I have moved the test into schedule_tail where siglock can be grabbed and recalc_sigpending can be reused directly. Further as the last action of setting up a new task this guarantees that TIF_SIGPENDING will be properly set in the new process. The helper calculate_sigpending takes the siglock and uncontitionally sets TIF_SIGPENDING and let's recalc_sigpending clear TIF_SIGPENDING if it is unnecessary. This allows reusing the existing code and keeps maintenance of the conditions simple. Oleg Nesterov suggested the movement and pointed out the need to take siglock if this code was going to be called while the new task is discoverable. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + kernel/sched/core.c | 2 ++ kernel/signal.c | 11 +++++++++++ 3 files changed, 14 insertions(+) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 94558ffa82ab..b55fd293c1e5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -372,6 +372,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) */ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); +extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..3e4ed4b7aa2d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2813,6 +2813,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); + + calculate_sigpending(); } /* diff --git a/kernel/signal.c b/kernel/signal.c index dddbea558455..1e06f1eba363 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -172,6 +172,17 @@ void recalc_sigpending(void) } +void calculate_sigpending(void) +{ + /* Have any signals or users of TIF_SIGPENDING been delayed + * until after fork? + */ + spin_lock_irq(¤t->sighand->siglock); + set_tsk_thread_flag(current, TIF_SIGPENDING); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ From 4390e9eadbbb6774b7ba03fde0a0fdf3f07db4cd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Aug 2018 20:10:54 -0500 Subject: [PATCH 19/22] fork: Skip setting TIF_SIGPENDING in ptrace_init_task The code in calculate_sigpending will now handle this so it is just redundant and possibly a little confusing to continue setting TIF_SIGPENDING in ptrace_init_task. Suggested-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 037bf0ef1ae9..4f36431c380b 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -214,8 +214,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); - - set_tsk_thread_flag(child, TIF_SIGPENDING); } else child->ptracer_cred = NULL; From 924de3b8c9410c404c6eda7abffd282b97b3ff7f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 13:38:00 -0500 Subject: [PATCH 20/22] fork: Have new threads join on-going signal group stops There are only two signals that are delivered to every member of a signal group: SIGSTOP and SIGKILL. Signal delivery requires every signal appear to be delivered either before or after a clone syscall. SIGKILL terminates the clone so does not need to be considered. Which leaves only SIGSTOP that needs to be considered when creating new threads. Today in the event of a group stop TIF_SIGPENDING will get set and the fork will restart ensuring the fork syscall participates in the group stop. A fork (especially of a process with a lot of memory) is one of the most expensive system so we really only want to restart a fork when necessary. It is easy so check to see if a SIGSTOP is ongoing and have the new thread join it immediate after the clone completes. Making it appear the clone completed happened just before the SIGSTOP. The calculate_sigpending function will see the bits set in jobctl and set TIF_SIGPENDING to ensure the new task takes the slow path to userspace. V2: The call to task_join_group_stop was moved before the new task is added to the thread group list. This should not matter as sighand->siglock is held over both the addition of the threads, the call to task_join_group_stop and do_signal_stop. But the change is trivial and it is one less thing to worry about when reading the code. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 ++ kernel/fork.c | 27 +++++++++++++++------------ kernel/signal.c | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b55fd293c1e5..ae2b0b81be25 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -385,6 +385,8 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) signal_wake_up_state(t, resume ? __TASK_TRACED : 0); } +void task_join_group_stop(struct task_struct *task); + #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on diff --git a/kernel/fork.c b/kernel/fork.c index 22d4cdb9a7ca..ab731e15a600 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1934,18 +1934,20 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). - */ - recalc_sigpending(); - if (signal_pending(current)) { - retval = -ERESTARTNOINTR; - goto bad_fork_cancel_cgroup; + if (!(clone_flags & CLONE_THREAD)) { + /* + * Process group and session signals need to be delivered to just the + * parent before the fork or both the parent and the child after the + * fork. Restart if a signal comes in before we add the new process to + * it's process group. + * A fatal signal pending means that current will exit, so the new + * thread can't slip out of an OOM kill (or normal SIGKILL). + */ + recalc_sigpending(); + if (signal_pending(current)) { + retval = -ERESTARTNOINTR; + goto bad_fork_cancel_cgroup; + } } @@ -1982,6 +1984,7 @@ static __latent_entropy struct task_struct *copy_process( current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); + task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, diff --git a/kernel/signal.c b/kernel/signal.c index 1e06f1eba363..9f0eafb6d474 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -373,6 +373,20 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +void task_join_group_stop(struct task_struct *task) +{ + /* Have the new thread join an on-going signal group stop */ + unsigned long jobctl = current->jobctl; + if (jobctl & JOBCTL_STOP_PENDING) { + struct signal_struct *sig = current->signal; + unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + if (task_set_jobctl_pending(task, signr | gstop)) { + sig->group_stop_count++; + } + } +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an From c3ad2c3b02e953ead2b8d52a0c9e70312930c3d0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 15:20:37 -0500 Subject: [PATCH 21/22] signal: Don't restart fork when signals come in. Wen Yang and majiang report that a periodic signal received during fork can cause fork to continually restart preventing an application from making progress. The code was being overly pessimistic. Fork needs to guarantee that a signal sent to multiple processes is logically delivered before the fork and just to the forking process or logically delivered after the fork to both the forking process and it's newly spawned child. For signals like periodic timers that are always delivered to a single process fork can safely complete and let them appear to logically delivered after the fork(). While examining this issue I also discovered that fork today will miss signals delivered to multiple processes during the fork and handled by another thread. Similarly the current code will also miss blocked signals that are delivered to multiple process, as those signals will not appear pending during fork. Add a list of each thread that is currently forking, and keep on that list a signal set that records all of the signals sent to multiple processes. When fork completes initialize the new processes shared_pending signal set with it. The calculate_sigpending function will see those signals and set TIF_SIGPENDING causing the new task to take the slow path to userspace to handle those signals. Making it appear as if those signals were received immediately after the fork. It is not possible to send real time signals to multiple processes and exceptions don't go to multiple processes, which means that that are no signals sent to multiple processes that require siginfo. This means it is safe to not bother collecting siginfo on signals sent during fork. The sigaction of a child of fork is initially the same as the sigaction of the parent process. So a signal the parent ignores the child will also initially ignore. Therefore it is safe to ignore signals sent to multiple processes and ignored by the forking process. Signals sent to only a single process or only a single thread and delivered during fork are treated as if they are received after the fork, and generally not dealt with. They won't cause any problems. V2: Added removal from the multiprocess list on failure. V3: Use -ERESTARTNOINTR directly V4: - Don't queue both SIGCONT and SIGSTOP - Initialize signal_struct.multiprocess in init_task - Move setting of shared_pending to before the new task is visible to signals. This prevents signals from comming in before shared_pending.signal is set to delayed.signal and being lost. V5: - rework list add and delete to account for idle threads v6: - Use sigdelsetmask when removing stop signals Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200447 Reported-by: Wen Yang and Reported-by: majiang Fixes: 4a2c7a7837da ("[PATCH] make fork() atomic wrt pgrp/session signals") Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 8 +++++++ init/init_task.c | 1 + kernel/fork.c | 43 +++++++++++++++++++++--------------- kernel/signal.c | 15 +++++++++++++ 4 files changed, 49 insertions(+), 18 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ae2b0b81be25..4e9b77fb702d 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -69,6 +69,11 @@ struct thread_group_cputimer { bool checking_timer; }; +struct multiprocess_signals { + sigset_t signal; + struct hlist_node node; +}; + /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always @@ -90,6 +95,9 @@ struct signal_struct { /* shared signal handling: */ struct sigpending shared_pending; + /* For collecting multiprocess signals during fork */ + struct hlist_head multiprocess; + /* thread group exit support */ int group_exit_code; /* overloaded: diff --git a/init/init_task.c b/init/init_task.c index 4f97846256d7..5aebe3be4d7c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,6 +22,7 @@ static struct signal_struct init_signals = { .list = LIST_HEAD_INIT(init_signals.shared_pending.list), .signal = {{0}} }, + .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), #ifdef CONFIG_POSIX_TIMERS diff --git a/kernel/fork.c b/kernel/fork.c index ab731e15a600..411e34acace7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1456,6 +1456,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); + INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); @@ -1602,6 +1603,7 @@ static __latent_entropy struct task_struct *copy_process( { int retval; struct task_struct *p; + struct multiprocess_signals delayed; /* * Don't allow sharing the root directory with processes in a different @@ -1649,6 +1651,24 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + /* + * Force any signals received before this point to be delivered + * before the fork happens. Collect up signals sent to multiple + * processes that happen during the fork and delay them so that + * they appear to happen after the fork. + */ + sigemptyset(&delayed.signal); + INIT_HLIST_NODE(&delayed.node); + + spin_lock_irq(¤t->sighand->siglock); + if (!(clone_flags & CLONE_THREAD)) + hlist_add_head(&delayed.node, ¤t->signal->multiprocess); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + retval = -ERESTARTNOINTR; + if (signal_pending(current)) + goto fork_out; + retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) @@ -1934,22 +1954,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - if (!(clone_flags & CLONE_THREAD)) { - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). - */ - recalc_sigpending(); - if (signal_pending(current)) { - retval = -ERESTARTNOINTR; - goto bad_fork_cancel_cgroup; - } - } - init_task_pid_links(p); if (likely(p->pid)) { @@ -1965,7 +1969,7 @@ static __latent_entropy struct task_struct *copy_process( ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } - + p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same @@ -1993,8 +1997,8 @@ static __latent_entropy struct task_struct *copy_process( attach_pid(p, PIDTYPE_PID); nr_threads++; } - total_forks++; + hlist_del_init(&delayed.node); spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); @@ -2059,6 +2063,9 @@ bad_fork_free: put_task_stack(p); free_task(p); fork_out: + spin_lock_irq(¤t->sighand->siglock); + hlist_del_init(&delayed.node); + spin_unlock_irq(¤t->sighand->siglock); return ERR_PTR(retval); } diff --git a/kernel/signal.c b/kernel/signal.c index 9f0eafb6d474..cfa9d10e731a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1121,6 +1121,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); + + /* Let multiprocess signals appear after on-going forks */ + if (type > PIDTYPE_TGID) { + struct multiprocess_signals *delayed; + hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { + sigset_t *signal = &delayed->signal; + /* Can't queue both a stop and a continue signal */ + if (sig == SIGCONT) + sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); + else if (sig_kernel_stop(sig)) + sigdelset(signal, SIGCONT); + sigaddset(signal, sig); + } + } + complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); From 84fe4cc09abc1a5ef3a282db3ed10f4d3f1e6a0b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Aug 2018 21:20:46 -0500 Subject: [PATCH 22/22] signal: Don't send signals to tasks that don't exist Recently syzbot reported crashes in send_sigio_to_task and send_sigurg_to_task in linux-next. Despite finding a reproducer syzbot apparently did not bisected this or otherwise track down the offending commit in linux-next. I happened to see this report and examined the code because I had recently changed these functions as part of making PIDTYPE_TGID a real pid type so that fork would does not need to restart when receiving a signal. By examination I see that I spotted a bug in the code that could explain the reported crashes. When I took Oleg's suggestion and optimized send_sigurg and send_sigio to only send to a single task when type is PIDTYPE_PID or PIDTYPE_TGID I failed to handle pids that no longer point to tasks. The macro do_each_pid_task simply iterates for zero iterations. With pid_task an explicit NULL test is needed. Update the code to include the missing NULL test. Fixes: 019191342fec ("signal: Use PIDTYPE_TGID to clearly store where file signals will be sent") Reported-by: syzkaller-bugs@googlegroups.com Signed-off-by: "Eric W. Biederman" --- fs/fcntl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index a04accf6847f..4137d96534a6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -791,7 +791,8 @@ void send_sigio(struct fown_struct *fown, int fd, int band) if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); - send_sigio_to_task(p, fown, fd, band, type); + if (p) + send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); @@ -830,7 +831,8 @@ int send_sigurg(struct fown_struct *fown) if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); - send_sigurg_to_task(p, fown, type); + if (p) + send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock);