sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
commit ddae0ca2a8fe12d0e24ab10ba759c3fbd755ada8 upstream. It was reported that in moving to 6.1, a larger then 10% regression was seen in the performance of clock_gettime(CLOCK_THREAD_CPUTIME_ID,...). Using a simple reproducer, I found: 5.10: 100000000 calls in 24345994193 ns => 243.460 ns per call 100000000 calls in 24288172050 ns => 242.882 ns per call 100000000 calls in 24289135225 ns => 242.891 ns per call 6.1: 100000000 calls in 28248646742 ns => 282.486 ns per call 100000000 calls in 28227055067 ns => 282.271 ns per call 100000000 calls in 28177471287 ns => 281.775 ns per call The cause of this was finally narrowed down to the addition of psi_account_irqtime() in update_rq_clock_task(), in commit52b1364ba0
("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure"). In my initial attempt to resolve this, I leaned towards moving all accounting work out of the clock_gettime() call path, but it wasn't very pretty, so it will have to wait for a later deeper rework. Instead, Peter shared this approach: Rework psi_account_irqtime() to use its own psi_irq_time base for accounting, and move it out of the hotpath, calling it instead from sched_tick() and __schedule(). In testing this, we found the importance of ensuring psi_account_irqtime() is run under the rq_lock, which Johannes Weiner helpfully explained, so also add some lockdep annotations to make that requirement clear. With this change the performance is back in-line with 5.10: 6.1+fix: 100000000 calls in 24297324597 ns => 242.973 ns per call 100000000 calls in 24318869234 ns => 243.189 ns per call 100000000 calls in 24291564588 ns => 242.916 ns per call Reported-by: Jimmy Shiu <jimmyshiu@google.com> Originally-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: John Stultz <jstultz@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Reviewed-by: Qais Yousef <qyousef@layalina.io> Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com Fixes:52b1364ba0
("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure") [jstultz: Fixed up minor collisions w/ 6.6-stable] Signed-off-by: John Stultz <jstultz@google.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
4dc200e315
commit
448a2500d1
|
@ -722,7 +722,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
|
|||
|
||||
rq->prev_irq_time += irq_delta;
|
||||
delta -= irq_delta;
|
||||
psi_account_irqtime(rq->curr, irq_delta);
|
||||
delayacct_irq(rq->curr, irq_delta);
|
||||
#endif
|
||||
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
|
||||
|
@ -5641,7 +5640,7 @@ void scheduler_tick(void)
|
|||
{
|
||||
int cpu = smp_processor_id();
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct task_struct *curr = rq->curr;
|
||||
struct task_struct *curr;
|
||||
struct rq_flags rf;
|
||||
unsigned long thermal_pressure;
|
||||
u64 resched_latency;
|
||||
|
@ -5653,6 +5652,9 @@ void scheduler_tick(void)
|
|||
|
||||
rq_lock(rq, &rf);
|
||||
|
||||
curr = rq->curr;
|
||||
psi_account_irqtime(rq, curr, NULL);
|
||||
|
||||
update_rq_clock(rq);
|
||||
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
|
||||
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
|
||||
|
@ -6690,6 +6692,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
|
|||
++*switch_count;
|
||||
|
||||
migrate_disable_switch(rq, prev);
|
||||
psi_account_irqtime(rq, prev, next);
|
||||
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
|
||||
|
||||
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
|
||||
|
|
|
@ -784,6 +784,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
|||
enum psi_states s;
|
||||
u32 state_mask;
|
||||
|
||||
lockdep_assert_rq_held(cpu_rq(cpu));
|
||||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
/*
|
||||
|
@ -1002,19 +1003,29 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
|||
}
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
void psi_account_irqtime(struct task_struct *task, u32 delta)
|
||||
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
|
||||
{
|
||||
int cpu = task_cpu(task);
|
||||
int cpu = task_cpu(curr);
|
||||
struct psi_group *group;
|
||||
struct psi_group_cpu *groupc;
|
||||
u64 now;
|
||||
u64 now, irq;
|
||||
s64 delta;
|
||||
|
||||
if (!task->pid)
|
||||
if (!curr->pid)
|
||||
return;
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
group = task_psi_group(curr);
|
||||
if (prev && task_psi_group(prev) == group)
|
||||
return;
|
||||
|
||||
now = cpu_clock(cpu);
|
||||
irq = irq_time_read(cpu);
|
||||
delta = (s64)(irq - rq->psi_irq_time);
|
||||
if (delta < 0)
|
||||
return;
|
||||
rq->psi_irq_time = irq;
|
||||
|
||||
group = task_psi_group(task);
|
||||
do {
|
||||
if (!group->enabled)
|
||||
continue;
|
||||
|
|
|
@ -1094,6 +1094,7 @@ struct rq {
|
|||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
u64 prev_irq_time;
|
||||
u64 psi_irq_time;
|
||||
#endif
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
u64 prev_steal_time;
|
||||
|
|
|
@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
|
|||
void psi_task_change(struct task_struct *task, int clear, int set);
|
||||
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
bool sleep);
|
||||
void psi_account_irqtime(struct task_struct *task, u32 delta);
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
|
||||
#else
|
||||
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
|
||||
struct task_struct *prev) {}
|
||||
#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
|
||||
/*
|
||||
* PSI tracks state that persists across sleeps, such as iowaits and
|
||||
* memory stalls. As a result, it has to distinguish between sleeps,
|
||||
|
@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
|
|||
static inline void psi_sched_switch(struct task_struct *prev,
|
||||
struct task_struct *next,
|
||||
bool sleep) {}
|
||||
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
|
||||
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
|
||||
struct task_struct *prev) {}
|
||||
#endif /* CONFIG_PSI */
|
||||
|
||||
#ifdef CONFIG_SCHED_INFO
|
||||
|
|
Loading…
Reference in New Issue