Merge tag 'timers-core-2020-08-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull more timer updates from Thomas Gleixner:
 "A set of posix CPU timer changes which allows deferring the heavy
  work of posix CPU timers into task work context. The tick interrupt
  is reduced to a quick check that queues the work which does the
  heavy lifting before returning to user space or going back to guest
  mode.

  Moving this out defers signal delivery slightly, but posix CPU
  timers are inaccurate by nature as they depend on the tick, so there
  is no real damage. The relevant test cases all passed.

  This lifts the last offender for RT out of the hard interrupt
  context tick handler, but it also has the general benefit that the
  actual heavy work is accounted to the task/process and not to the
  tick interrupt itself.

  Further optimizations are possible to break up long sighand lock
  hold and interrupt disabled (on !RT kernels) times when a massive
  amount of posix CPU timers (which are unprivileged) is armed for a
  task/process.

  This is currently only enabled for x86 because the architecture has
  to ensure that task work is handled in KVM before entering a guest,
  which was just established for x86 with the new common entry/exit
  code merged post 5.8 and is not yet the case for other KVM
  architectures"

* tag 'timers-core-2020-08-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86: Select POSIX_CPU_TIMERS_TASK_WORK
  posix-cpu-timers: Provide mechanisms to defer timer handling to task_work
  posix-cpu-timers: Split run_posix_cpu_timers()
commit b6b178e38f
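For context, the timers this affects can be armed from unprivileged userspace. Below is a minimal illustrative sketch (not part of the patch; the 10 ms period, the SIGALRM choice and the busy loop are arbitrary, and older glibc needs -lrt) which arms a process-wide CPU-time timer whose expiry handling the kernel now defers to task work:

/* Arm a process-wide CPU-time timer; its expiry is what the kernel
 * now handles in task_work context instead of in the tick interrupt. */
#include <signal.h>
#include <time.h>

static void on_alarm(int sig)
{
	(void)sig;	/* just observe the delivery */
}

int main(void)
{
	struct sigaction sa = { .sa_handler = on_alarm };
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGALRM,
	};
	/* Fire after 10ms of consumed CPU time, then every further 10ms. */
	struct itimerspec its = {
		.it_value    = { .tv_nsec = 10 * 1000 * 1000 },
		.it_interval = { .tv_nsec = 10 * 1000 * 1000 },
	};
	timer_t tid;

	sigaction(SIGALRM, &sa, NULL);
	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid))
		return 1;
	if (timer_settime(tid, 0, &its, NULL))
		return 1;

	for (;;)	/* burn CPU so the timer expires repeatedly */
		;
}

When many such timers are armed for a task/process, the expiry checks done under sighand lock are the "heavy lifting" the pull request refers to.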
arch/x86/Kconfig
@@ -209,6 +209,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE	if PARAVIRT
+	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_FUNCTION_ARG_ACCESS_API
include/linux/posix-timers.h
@@ -6,6 +6,7 @@
 #include <linux/list.h>
 #include <linux/alarmtimer.h>
 #include <linux/timerqueue.h>
+#include <linux/task_work.h>
 
 struct kernel_siginfo;
 struct task_struct;
@@ -125,6 +126,16 @@ struct posix_cputimers {
 	unsigned int			expiry_active;
 };
 
+/**
+ * posix_cputimers_work - Container for task work based posix CPU timer expiry
+ * @work:	The task work to be scheduled
+ * @scheduled:	@work has been scheduled already, no further processing
+ */
+struct posix_cputimers_work {
+	struct callback_head	work;
+	unsigned int		scheduled;
+};
+
 static inline void posix_cputimers_init(struct posix_cputimers *pct)
 {
 	memset(pct, 0, sizeof(*pct));
@@ -165,6 +176,12 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 #endif
 
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+void posix_cputimers_init_work(void);
+#else
+static inline void posix_cputimers_init_work(void) { }
+#endif
+
 #define REQUEUE_PENDING 1
 
 /**
include/linux/sched.h
@@ -890,6 +890,10 @@ struct task_struct {
 	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
 	struct posix_cputimers		posix_cputimers;
 
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+	struct posix_cputimers_work	posix_cputimers_work;
+#endif
+
 	/* Process credentials: */
 
 	/* Tracer's credentials at attach: */
kernel/time/Kconfig
@@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
 config GENERIC_CMOS_UPDATE
 	bool
 
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+	bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+	bool
+	default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
 if GENERIC_CLOCKEVENTS
 menu "Timers subsystem"
kernel/time/posix-cpu-timers.c
@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
  */
 static int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
+	static struct lock_class_key posix_cpu_timers_key;
 	struct pid *pid;
 
 	rcu_read_lock();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
 		return -EINVAL;
 	}
 
+	/*
+	 * If posix timer expiry is handled in task work context then
+	 * timer::it_lock can be taken without disabling interrupts as all
+	 * other locking happens in task context. This requires a separate
+	 * lock class key otherwise regular posix timer expiry would record
+	 * the lock class being taken in interrupt context and generate a
+	 * false positive warning.
+	 */
+	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+		lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
 	new_timer->kclock = &clock_posix_cpu;
 	timerqueue_init(&new_timer->it.cpu.node);
 	new_timer->it.cpu.pid = get_pid(pid);
@@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
 	return false;
 }
 
-/*
- * This is called from the timer interrupt handler.  The irq handler has
- * already updated our counts.  We need to check if any timers fire now.
- * Interrupts are disabled.
- */
-void run_posix_cpu_timers(void)
-{
-	struct task_struct *tsk = current;
-	struct k_itimer *timer, *next;
-	unsigned long flags;
-	LIST_HEAD(firing);
-
-	lockdep_assert_irqs_disabled();
-
-	/*
-	 * The fast path checks that there are no expired thread or thread
-	 * group timers.  If that's so, just return.
-	 */
-	if (!fastpath_timer_check(tsk))
-		return;
-
-	lockdep_posixtimer_enter();
-	if (!lock_task_sighand(tsk, &flags)) {
-		lockdep_posixtimer_exit();
-		return;
-	}
-	/*
-	 * Here we take off tsk->signal->cpu_timers[N] and
-	 * tsk->cpu_timers[N] all the timers that are firing, and
-	 * put them on the firing list.
-	 */
-	check_thread_timers(tsk, &firing);
-
-	check_process_timers(tsk, &firing);
-
-	/*
-	 * We must release these locks before taking any timer's lock.
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+	handle_posix_cpu_timers(current);
+}
+
+/*
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
+ */
+void __init posix_cputimers_init_work(void)
+{
+	init_task_work(&current->posix_cputimers_work.work,
+		       posix_cpu_timers_work);
+}
+
+/*
+ * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
+ * in hard interrupt context or in task context with interrupts
+ * disabled. Aside of that the writer/reader interaction is always in the
+ * context of the current task, which means they are strict per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+	return tsk->posix_cputimers_work.scheduled;
+}
+
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
+		return;
+
+	/* Schedule task work to actually expire the timers */
+	tsk->posix_cputimers_work.scheduled = true;
+	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+						unsigned long start)
+{
+	bool ret = true;
+
+	/*
+	 * On !RT kernels interrupts are disabled while collecting expired
+	 * timers, so no tick can happen and the fast path check can be
+	 * reenabled without further checks.
+	 */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		tsk->posix_cputimers_work.scheduled = false;
+		return true;
+	}
+
+	/*
+	 * On RT enabled kernels ticks can happen while the expired timers
+	 * are collected under sighand lock. But any tick which observes
+	 * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
+	 * checks. So reenabling the tick work has to be done carefully:
+	 *
+	 * Disable interrupts and run the fast path check if jiffies have
+	 * advanced since the collecting of expired timers started. If
+	 * jiffies have not advanced or the fast path check did not find
+	 * newly expired timers, reenable the fast path check in the timer
+	 * interrupt. If there are newly expired timers, return false and
+	 * let the collection loop repeat.
+	 */
+	local_irq_disable();
+	if (start != jiffies && fastpath_timer_check(tsk))
+		ret = false;
+	else
+		tsk->posix_cputimers_work.scheduled = false;
+	local_irq_enable();
+
+	return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+	lockdep_posixtimer_enter();
+	handle_posix_cpu_timers(tsk);
+	lockdep_posixtimer_exit();
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+	return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+						unsigned long start)
+{
+	return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
+{
+	struct k_itimer *timer, *next;
+	unsigned long flags, start;
+	LIST_HEAD(firing);
+
+	if (!lock_task_sighand(tsk, &flags))
+		return;
+
+	do {
+		/*
+		 * On RT locking sighand lock does not disable interrupts,
+		 * so this needs to be careful vs. ticks. Store the current
+		 * jiffies value.
+		 */
+		start = READ_ONCE(jiffies);
+		barrier();
+
+		/*
+		 * Here we take off tsk->signal->cpu_timers[N] and
+		 * tsk->cpu_timers[N] all the timers that are firing, and
+		 * put them on the firing list.
+		 */
+		check_thread_timers(tsk, &firing);
+
+		check_process_timers(tsk, &firing);
+
+		/*
+		 * The above timer checks have updated the expiry cache and
+		 * because nothing can have queued or modified timers after
+		 * sighand lock was taken above it is guaranteed to be
+		 * consistent. So the next timer interrupt fastpath check
+		 * will find valid data.
+		 *
+		 * If timer expiry runs in the timer interrupt context then
+		 * the loop is not relevant as timers will be directly
+		 * expired in interrupt context. The stub function below
+		 * returns always true which allows the compiler to
+		 * optimize the loop out.
+		 *
+		 * If timer expiry is deferred to task work context then
+		 * the following rules apply:
+		 *
+		 * - On !RT kernels no tick can have happened on this CPU
+		 *   after sighand lock was acquired because interrupts are
+		 *   disabled. So reenabling task work before dropping
+		 *   sighand lock and reenabling interrupts is race free.
+		 *
+		 * - On RT kernels ticks might have happened but the tick
+		 *   work ignored posix CPU timer handling because the
+		 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+		 *   must be done very carefully including a check whether
+		 *   ticks have happened since the start of the timer
+		 *   expiry checks. posix_cpu_timers_enable_work() takes
+		 *   care of that and eventually lets the expiry checks
+		 *   run again.
+		 */
+	} while (!posix_cpu_timers_enable_work(tsk, start));
+
+	/*
+	 * We must release sighand lock before taking any timer's lock.
 	 * There is a potential race with timer deletion here, as the
 	 * siglock now protects our private firing list.  We have set
 	 * the firing flag in each timer, so that a deletion attempt
@@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void)
 	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
 		int cpu_firing;
 
+		/*
+		 * spin_lock() is sufficient here even independent of the
+		 * expiry context. If expiry happens in hard interrupt
+		 * context it's obvious. For task work context it's safe
+		 * because all other operations on timer::it_lock happen in
+		 * task context (syscall or exit).
+		 */
 		spin_lock(&timer->it_lock);
 		list_del_init(&timer->it.cpu.elist);
 		cpu_firing = timer->it.cpu.firing;
|
@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void)
|
||||||
cpu_timer_fire(timer);
|
cpu_timer_fire(timer);
|
||||||
spin_unlock(&timer->it_lock);
|
spin_unlock(&timer->it_lock);
|
||||||
}
|
}
|
||||||
lockdep_posixtimer_exit();
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is called from the timer interrupt handler. The irq handler has
|
||||||
|
* already updated our counts. We need to check if any timers fire now.
|
||||||
|
* Interrupts are disabled.
|
||||||
|
*/
|
||||||
|
void run_posix_cpu_timers(void)
|
||||||
|
{
|
||||||
|
struct task_struct *tsk = current;
|
||||||
|
|
||||||
|
lockdep_assert_irqs_disabled();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the actual expiry is deferred to task work context and the
|
||||||
|
* work is already scheduled there is no point to do anything here.
|
||||||
|
*/
|
||||||
|
if (posix_cpu_timers_work_scheduled(tsk))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The fast path checks that there are no expired thread or thread
|
||||||
|
* group timers. If that's so, just return.
|
||||||
|
*/
|
||||||
|
if (!fastpath_timer_check(tsk))
|
||||||
|
return;
|
||||||
|
|
||||||
|
__run_posix_cpu_timers(tsk);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
kernel/time/timer.c
@@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void)
 void __init init_timers(void)
 {
 	init_timer_cpus();
+	posix_cputimers_init_work();
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }