sched/fair: Rework sched_fair time accounting

Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This
results in him occasionally seeing time going backwards - which
crashes the scheduler ...

Most of our time accounting can actually handle that, except for the
most common case: the tick time update of sched_fair.

There is a further problem with that code: previously we assumed that,
because we get a tick every TICK_NSEC, our time delta could never
exceed 32 bits, which kept the math simple.

However, ever since Frederic managed to get NO_HZ_FULL merged, this is
no longer the case: a task can now run for a very long time without
getting a tick. It takes only about 4.3 seconds (2^32 ns) to overflow
our u32 in nanoseconds.

This means we need not only to deal better with time going backwards,
but also to be able to deal with large deltas.

This patch reworks the entire code and uses mul_u64_u32_shr() as
proposed by Andy a long while ago.

We express our virtual time scale factor as a u32 multiplier plus a
right shift; the 32-bit mul_u64_u32_shr() implementation then reduces
to a single 32x32->64 multiply when the time delta is still short (the
common case).

On 64-bit, a 64x64->128 multiply can be used if ARCH_SUPPORTS_INT128 is
set.

Reported-and-Tested-by: Christian Engelmayer <cengelma@gmx.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: fweisbec@gmail.com
Cc: Paul Turner <pjt@google.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Peter Zijlstra 2013-11-18 18:27:06 +01:00 committed by Ingo Molnar
parent be5e610c0f
commit 9dbdb15553
2 changed files with 67 additions and 82 deletions

View File

@ -930,7 +930,8 @@ struct pipe_inode_info;
struct uts_namespace; struct uts_namespace;
struct load_weight { struct load_weight {
unsigned long weight, inv_weight; unsigned long weight;
u32 inv_weight;
}; };
struct sched_avg { struct sched_avg {

View File

@ -178,59 +178,61 @@ void sched_init_granularity(void)
update_sysctl(); update_sysctl();
} }
#if BITS_PER_LONG == 32 #define WMULT_CONST (~0U)
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif
#define WMULT_SHIFT 32 #define WMULT_SHIFT 32
/* static void __update_inv_weight(struct load_weight *lw)
* Shift right and round:
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
/*
* delta *= weight / lw
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{ {
u64 tmp; unsigned long w;
/* if (likely(lw->inv_weight))
* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched return;
* entities since MIN_SHARES = 2. Treat weight as 1 if less than
* 2^SCHED_LOAD_RESOLUTION. w = scale_load_down(lw->weight);
*/
if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
tmp = (u64)delta_exec * scale_load_down(weight); lw->inv_weight = 1;
else if (unlikely(!w))
lw->inv_weight = WMULT_CONST;
else else
tmp = (u64)delta_exec; lw->inv_weight = WMULT_CONST / w;
}
if (!lw->inv_weight) { /*
unsigned long w = scale_load_down(lw->weight); * delta_exec * weight / lw.weight
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
* weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
int shift = WMULT_SHIFT;
if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) __update_inv_weight(lw);
lw->inv_weight = 1;
else if (unlikely(!w)) if (unlikely(fact >> 32)) {
lw->inv_weight = WMULT_CONST; while (fact >> 32) {
else fact >>= 1;
lw->inv_weight = WMULT_CONST / w; shift--;
}
} }
/* /* hint to use a 32x32->64 mul */
* Check whether we'd overflow the 64-bit multiplication: fact = (u64)(u32)fact * lw->inv_weight;
*/
if (unlikely(tmp > WMULT_CONST))
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
WMULT_SHIFT/2);
else
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); while (fact >> 32) {
fact >>= 1;
shift--;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
} }
@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/************************************************************** /**************************************************************
* Scheduling class tree data structure manipulation methods: * Scheduling class tree data structure manipulation methods:
@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
/* /*
* delta /= w * delta /= w
*/ */
static inline unsigned long static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{ {
if (unlikely(se->load.weight != NICE_0_LOAD)) if (unlikely(se->load.weight != NICE_0_LOAD))
delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta; return delta;
} }
@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&lw, se->load.weight); update_load_add(&lw, se->load.weight);
load = &lw; load = &lw;
} }
slice = calc_delta_mine(slice, se->load.weight, load); slice = __calc_delta(slice, se->load.weight, load);
} }
return slice; return slice;
} }
@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
#endif #endif
/* /*
* Update the current task's runtime statistics. Skip current tasks that * Update the current task's runtime statistics.
* are not in our scheduling class.
*/ */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
schedstat_set(curr->statistics.exec_max,
max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq) static void update_curr(struct cfs_rq *cfs_rq)
{ {
struct sched_entity *curr = cfs_rq->curr; struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq)); u64 now = rq_clock_task(rq_of(cfs_rq));
unsigned long delta_exec; u64 delta_exec;
if (unlikely(!curr)) if (unlikely(!curr))
return; return;
/* delta_exec = now - curr->exec_start;
* Get the amount of time the current task was running if (unlikely((s64)delta_exec <= 0))
* since the last time we changed load (this cannot
* overflow on 32 bits):
*/
delta_exec = (unsigned long)(now - curr->exec_start);
if (!delta_exec)
return; return;
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now; curr->exec_start = now;
schedstat_set(curr->statistics.exec_max,
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) { if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr); struct task_struct *curtask = task_of(curr);
@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
} }
} }
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
unsigned long delta_exec)
{ {
/* dock delta_exec before expiring quota (as it could span periods) */ /* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec; cfs_rq->runtime_remaining -= delta_exec;
@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
} }
static __always_inline static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{ {
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return; return;
@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
return rq_clock_task(rq_of(cfs_rq)); return rq_clock_task(rq_of(cfs_rq));
} }
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}