prlimit and set/getpriority tasklist_lock optimizations
The tasklist_lock popped up as a scalability bottleneck on some testing workloads. The readlocks in do_prlimit and set/getpriority are not necessary in all cases. Based on a cycles profile, it looked like ~87% of the time was spent in the kernel, ~42% of which was just trying to get *some* spinlock (queued_spin_lock_slowpath, not necessarily the tasklist_lock). The big offenders (with rough percentages in cycles of the overall trace): - do_wait 11% - setpriority 8% (this patchset) - kill 8% - do_exit 5% - clone 3% - prlimit64 2% (this patchset) - getrlimit 1% (this patchset) I can't easily test this patchset on the original workload for various reasons. Instead, I used the microbenchmark below to at least verify there was some improvement. This patchset had a 28% speedup (12% from baseline to set/getprio, then another 14% for prlimit). One interesting thing is that my libc's getrlimit() was calling prlimit64, so hoisting the read_lock(tasklist_lock) into sys_prlimit64 had no effect - it essentially optimized the older syscalls only. I didn't do that in this patchset, but figured I'd mention it since it was an option from the previous patch's discussion. v3: https://lkml.kernel.org/r/20220106172041.522167-1-brho@google.com v2: https://lore.kernel.org/lkml/20220105212828.197013-1-brho@google.com/ - update_rlimit_cpu on the group_leader instead of for_each_thread. - update_rlimit_cpu still returns 0 or -ESRCH, even though we don't care about the error here. It felt safer that way in case someone uses that function again. 
v1: https://lore.kernel.org/lkml/20211213220401.1039578-1-brho@google.com/ int main(int argc, char **argv) { pid_t child; struct rlimit rlim[1]; fork(); fork(); fork(); fork(); fork(); fork(); for (int i = 0; i < 5000; i++) { child = fork(); if (child < 0) exit(1); if (child > 0) { usleep(1000); kill(child, SIGTERM); waitpid(child, NULL, 0); } else { for (;;) { setpriority(PRIO_PROCESS, 0, getpriority(PRIO_PROCESS, 0)); getrlimit(RLIMIT_CPU, rlim); } } } return 0; } Barret Rhoden (3): setpriority: only grab the tasklist_lock for PRIO_PGRP prlimit: make do_prlimit() static prlimit: do not grab the tasklist_lock include/linux/posix-timers.h | 2 +- include/linux/resource.h | 2 - kernel/sys.c | 127 +++++++++++++++++---------------- kernel/time/posix-cpu-timers.c | 12 +++- 4 files changed, 76 insertions(+), 67 deletions(-) I have dropped the first change in this series as an almost identical change was merged as commit 7f8ca0edfe
("kernel/sys.c: only take tasklist_lock for get/setpriority(PRIO_PGRP)"). Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEgjlraLDcwBA2B+6cC/v6Eiajj0AFAmI7eCAACgkQC/v6Eiaj j0CN8w/+MEol1+sB/mDKgDgqbNE0sIXHTjQF37KPrsqB51aas9LSX7E7CBzvxF3M Y0MSk0VzSt4oGpmrNQOAEueeMeaMucPxI5JejGHEhtdHFBMqYXKpWuhqewIHx1pc lUcYpDeUOOBjwLO/VT5hfAKzIEMUl6tEDfzexl9IvpVwd661nVjDe+z12mDplJTi tjO8ZiSHkjkLE3cAYaTCajsaqpj7NLuIYB1d4CbbpU3vO5LYoffj/vtQ1e+7UxMB jhgaP/ylo0Ab8udYJ0PFIDmmQG/6s7csc3I1wtMgf8mqv88z4xspXNZBwYvf2hxa lBpSo+zD8Q88XipC+w63iBUa7YElLaai9xpLInO/Ir42G03/H/8TS9me1OLG+1Cz vloOid6CqH7KkNQ842txXeyj3xjW1DGR7U0QOrSxFQuWc6WZ2Q/l8KIZsuXuyt9G EwTjtoQvr1R+FNMtT/4g5WZ8sTYooIaHFvFQ745T6FzBp8mCVjINg4SUbVV3Wvck JRMxuHSFFBXj8IIJi9Bv6UE/j5APwa209KthvFCQayniNZU3XPKVa/bDWVoBk+SK Hch3M//QdAjKYmRf5gmDaBbRyqzaeiFjvX1MSnkbFryBX4/yIoEfo0/QsDRzSrJV vSSSU79h/XDI080gILOzNX4HiI4cpNcpOIB63Pmajyr6MxhrMqE= =VVGP -----END PGP SIGNATURE----- Merge tag 'prlimit-tasklist_lock-for-v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace Pull tasklist_lock optimizations from Eric Biederman: "prlimit and getpriority tasklist_lock optimizations The tasklist_lock popped up as a scalability bottleneck on some testing workloads. The readlocks in do_prlimit and set/getpriority are not necessary in all cases. Based on a cycles profile, it looked like ~87% of the time was spent in the kernel, ~42% of which was just trying to get *some* spinlock (queued_spin_lock_slowpath, not necessarily the tasklist_lock). The big offenders (with rough percentages in cycles of the overall trace): - do_wait 11% - setpriority 8% (done previously in commit 7f8ca0edfe
) - kill 8% - do_exit 5% - clone 3% - prlimit64 2% (this patchset) - getrlimit 1% (this patchset) I can't easily test this patchset on the original workload for various reasons. Instead, I used the microbenchmark below to at least verify there was some improvement. This patchset had a 28% speedup (12% from baseline to set/getprio, then another 14% for prlimit). This series used to do the setpriority case, but an almost identical change was merged as commit 7f8ca0edfe
("kernel/sys.c: only take tasklist_lock for get/setpriority(PRIO_PGRP)") so that has been dropped from here. One interesting thing is that my libc's getrlimit() was calling prlimit64, so hoisting the read_lock(tasklist_lock) into sys_prlimit64 had no effect - it essentially optimized the older syscalls only. I didn't do that in this patchset, but figured I'd mention it since it was an option from the previous patch's discussion" microbenchmark.c: --------------- int main(int argc, char **argv) { pid_t child; struct rlimit rlim[1]; fork(); fork(); fork(); fork(); fork(); fork(); for (int i = 0; i < 5000; i++) { child = fork(); if (child < 0) exit(1); if (child > 0) { usleep(1000); kill(child, SIGTERM); waitpid(child, NULL, 0); } else { for (;;) { setpriority(PRIO_PROCESS, 0, getpriority(PRIO_PROCESS, 0)); getrlimit(RLIMIT_CPU, rlim); } } } return 0; } Link: https://lore.kernel.org/lkml/20211213220401.1039578-1-brho@google.com/ [v1] Link: https://lore.kernel.org/lkml/20220105212828.197013-1-brho@google.com/ [v2] Link: https://lore.kernel.org/lkml/20220106172041.522167-1-brho@google.com/ [v3] * tag 'prlimit-tasklist_lock-for-v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: prlimit: do not grab the tasklist_lock prlimit: make do_prlimit() static
This commit is contained in:
commit
cd4699c5fd
|
@ -253,7 +253,7 @@ void posix_cpu_timers_exit_group(struct task_struct *task);
|
|||
void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
|
||||
u64 *newval, u64 *oldval);
|
||||
|
||||
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
|
||||
int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
|
||||
|
||||
void posixtimer_rearm(struct kernel_siginfo *info);
|
||||
#endif
|
||||
|
|
|
@ -8,7 +8,5 @@
|
|||
struct task_struct;
|
||||
|
||||
void getrusage(struct task_struct *p, int who, struct rusage *ru);
|
||||
int do_prlimit(struct task_struct *tsk, unsigned int resource,
|
||||
struct rlimit *new_rlim, struct rlimit *old_rlim);
|
||||
|
||||
#endif
|
||||
|
|
119
kernel/sys.c
119
kernel/sys.c
|
@ -1424,6 +1424,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
|
|||
return errno;
|
||||
}
|
||||
|
||||
/* make sure you are allowed to change @tsk limits before calling this */
|
||||
static int do_prlimit(struct task_struct *tsk, unsigned int resource,
|
||||
struct rlimit *new_rlim, struct rlimit *old_rlim)
|
||||
{
|
||||
struct rlimit *rlim;
|
||||
int retval = 0;
|
||||
|
||||
if (resource >= RLIM_NLIMITS)
|
||||
return -EINVAL;
|
||||
if (new_rlim) {
|
||||
if (new_rlim->rlim_cur > new_rlim->rlim_max)
|
||||
return -EINVAL;
|
||||
if (resource == RLIMIT_NOFILE &&
|
||||
new_rlim->rlim_max > sysctl_nr_open)
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
/* Holding a refcount on tsk protects tsk->signal from disappearing. */
|
||||
rlim = tsk->signal->rlim + resource;
|
||||
task_lock(tsk->group_leader);
|
||||
if (new_rlim) {
|
||||
/*
|
||||
* Keep the capable check against init_user_ns until cgroups can
|
||||
* contain all limits.
|
||||
*/
|
||||
if (new_rlim->rlim_max > rlim->rlim_max &&
|
||||
!capable(CAP_SYS_RESOURCE))
|
||||
retval = -EPERM;
|
||||
if (!retval)
|
||||
retval = security_task_setrlimit(tsk, resource, new_rlim);
|
||||
}
|
||||
if (!retval) {
|
||||
if (old_rlim)
|
||||
*old_rlim = *rlim;
|
||||
if (new_rlim)
|
||||
*rlim = *new_rlim;
|
||||
}
|
||||
task_unlock(tsk->group_leader);
|
||||
|
||||
/*
|
||||
* RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
|
||||
* infinite. In case of RLIM_INFINITY the posix CPU timer code
|
||||
* ignores the rlimit.
|
||||
*/
|
||||
if (!retval && new_rlim && resource == RLIMIT_CPU &&
|
||||
new_rlim->rlim_cur != RLIM_INFINITY &&
|
||||
IS_ENABLED(CONFIG_POSIX_TIMERS)) {
|
||||
/*
|
||||
* update_rlimit_cpu can fail if the task is exiting, but there
|
||||
* may be other tasks in the thread group that are not exiting,
|
||||
* and they need their cpu timers adjusted.
|
||||
*
|
||||
* The group_leader is the last task to be released, so if we
|
||||
* cannot update_rlimit_cpu on it, then the entire process is
|
||||
* exiting and we do not need to update at all.
|
||||
*/
|
||||
update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
|
||||
{
|
||||
struct rlimit value;
|
||||
|
@ -1567,63 +1629,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
|
|||
rlim->rlim_max = (unsigned long)rlim64->rlim_max;
|
||||
}
|
||||
|
||||
/* make sure you are allowed to change @tsk limits before calling this */
|
||||
int do_prlimit(struct task_struct *tsk, unsigned int resource,
|
||||
struct rlimit *new_rlim, struct rlimit *old_rlim)
|
||||
{
|
||||
struct rlimit *rlim;
|
||||
int retval = 0;
|
||||
|
||||
if (resource >= RLIM_NLIMITS)
|
||||
return -EINVAL;
|
||||
if (new_rlim) {
|
||||
if (new_rlim->rlim_cur > new_rlim->rlim_max)
|
||||
return -EINVAL;
|
||||
if (resource == RLIMIT_NOFILE &&
|
||||
new_rlim->rlim_max > sysctl_nr_open)
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
/* protect tsk->signal and tsk->sighand from disappearing */
|
||||
read_lock(&tasklist_lock);
|
||||
if (!tsk->sighand) {
|
||||
retval = -ESRCH;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rlim = tsk->signal->rlim + resource;
|
||||
task_lock(tsk->group_leader);
|
||||
if (new_rlim) {
|
||||
/* Keep the capable check against init_user_ns until
|
||||
cgroups can contain all limits */
|
||||
if (new_rlim->rlim_max > rlim->rlim_max &&
|
||||
!capable(CAP_SYS_RESOURCE))
|
||||
retval = -EPERM;
|
||||
if (!retval)
|
||||
retval = security_task_setrlimit(tsk, resource, new_rlim);
|
||||
}
|
||||
if (!retval) {
|
||||
if (old_rlim)
|
||||
*old_rlim = *rlim;
|
||||
if (new_rlim)
|
||||
*rlim = *new_rlim;
|
||||
}
|
||||
task_unlock(tsk->group_leader);
|
||||
|
||||
/*
|
||||
* RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
|
||||
* infinite. In case of RLIM_INFINITY the posix CPU timer code
|
||||
* ignores the rlimit.
|
||||
*/
|
||||
if (!retval && new_rlim && resource == RLIMIT_CPU &&
|
||||
new_rlim->rlim_cur != RLIM_INFINITY &&
|
||||
IS_ENABLED(CONFIG_POSIX_TIMERS))
|
||||
update_rlimit_cpu(tsk, new_rlim->rlim_cur);
|
||||
out:
|
||||
read_unlock(&tasklist_lock);
|
||||
return retval;
|
||||
}
|
||||
|
||||
/* rcu lock must be held */
|
||||
static int check_prlimit_permission(struct task_struct *task,
|
||||
unsigned int flags)
|
||||
|
|
|
@ -34,14 +34,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
|
|||
* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
|
||||
* necessary. Needs siglock protection since other code may update the
|
||||
* expiration cache as well.
|
||||
*
|
||||
* Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
|
||||
* we cannot lock_task_sighand. Cannot fail if task is current.
|
||||
*/
|
||||
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
|
||||
int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
|
||||
{
|
||||
u64 nsecs = rlim_new * NSEC_PER_SEC;
|
||||
unsigned long irq_fl;
|
||||
|
||||
spin_lock_irq(&task->sighand->siglock);
|
||||
if (!lock_task_sighand(task, &irq_fl))
|
||||
return -ESRCH;
|
||||
set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
|
||||
spin_unlock_irq(&task->sighand->siglock);
|
||||
unlock_task_sighand(task, &irq_fl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
Loading…
Reference in New Issue