sched: Normalize tg load contributions against runnable time

Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.

To handle this we divide up the shares into weights proportionate with the load
on each cfs_rq.  This does not, however, account for the fact that the sum of
the parts may be less than one cpu, and so we need to normalize:
  load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Paul Turner 2012-10-04 13:18:31 +02:00 committed by Ingo Molnar
parent 8165e145ce
commit bb17f65571
3 changed files with 62 additions and 0 deletions

View File

@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
atomic64_read(&cfs_rq->tg->load_avg)); atomic64_read(&cfs_rq->tg->load_avg));
SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib); cfs_rq->tg_load_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
cfs_rq->tg_runnable_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
#endif #endif
print_cfs_group_stats(m, cpu, cfs_rq->tg); print_cfs_group_stats(m, cpu, cfs_rq->tg);

View File

@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
} }
} }
/*
 * Fold the runnable average tracked in @sa into the aggregate kept on the
 * task_group that owns @cfs_rq (tg->runnable_avg).  The amount this cfs_rq
 * has already published is remembered in cfs_rq->tg_runnable_contrib so that
 * only the difference is applied to the group-wide counter.
 */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
					    struct cfs_rq *cfs_rq)
{
	long delta;

	/*
	 * Runnable fraction for this cfs_rq, scaled up by NICE_0_SHIFT bits.
	 * The "+ 1" keeps the divisor non-zero when the period is still 0.
	 */
	delta = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
			sa->runnable_avg_period + 1);

	/* Turn the absolute value into a change versus what was last published. */
	delta -= cfs_rq->tg_runnable_contrib;

	/*
	 * Leave tg->runnable_avg untouched unless the change is larger than
	 * 1/64th of the currently published contribution.
	 */
	if (abs(delta) <= cfs_rq->tg_runnable_contrib / 64)
		return;

	atomic_add(delta, &cfs_rq->tg->runnable_avg);
	cfs_rq->tg_runnable_contrib += delta;
}
/*
 * Recompute se->avg.load_avg_contrib for a group entity from the load
 * contribution of the cfs_rq it owns, then scale it down when the group's
 * aggregate runnable average is below NICE_0_LOAD.
 */
static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void __update_group_entity_contrib(struct sched_entity *se)
{ {
struct cfs_rq *cfs_rq = group_cfs_rq(se); struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg = cfs_rq->tg; struct task_group *tg = cfs_rq->tg;
int runnable_avg;
u64 contrib; u64 contrib;
/*
 * This cfs_rq's portion of tg->shares: tg_load_contrib * shares divided by
 * the group-wide load_avg.  The "+ 1" keeps the divisor non-zero when
 * load_avg reads as 0.
 */
contrib = cfs_rq->tg_load_contrib * tg->shares; contrib = cfs_rq->tg_load_contrib * tg->shares;
se->avg.load_avg_contrib = div64_u64(contrib, se->avg.load_avg_contrib = div64_u64(contrib,
atomic64_read(&tg->load_avg) + 1); atomic64_read(&tg->load_avg) + 1);
/*
* For group entities we need to compute a correction term in the case
* that they are consuming <1 cpu so that we would contribute the same
* load as a task of equal weight.
*
* Explicitly co-ordinating this measurement would be expensive, but
* fortunately the sum of each cpu's contribution forms a usable
* lower-bound on the true value.
*
* Consider the aggregate of 2 contributions. Either they are disjoint
* (and the sum represents true value) or they are not disjoint and we
* are understating by the aggregate of their overlap.
*
* Extending this to N cpus, for a given overlap, the maximum amount we
* understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
* cpus that overlap for this interval and w_i is the interval width.
*
* On a small machine, the first term is well-bounded which bounds the
* total error since w_i is a subset of the period. Whereas on a
* larger machine, while this first term can be larger, if w_i is of
* consequential size we are guaranteed to see n_i*w_i quickly converge
* to our upper bound of 1-cpu.
*/
runnable_avg = atomic_read(&tg->runnable_avg);
/*
 * Below NICE_0_LOAD, multiply by runnable_avg and shift right by
 * NICE_0_SHIFT, i.e. scale by runnable_avg / (1 << NICE_0_SHIFT).
 * NOTE(review): assumes NICE_0_LOAD == 1 << NICE_0_SHIFT; the definitions
 * are outside this view.
 */
if (runnable_avg < NICE_0_LOAD) {
se->avg.load_avg_contrib *= runnable_avg;
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
}
} }
#else #else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update) {} int force_update) {}
/* Empty stub for the #else side of the conditional (guarding option not visible here). */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif #endif
@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
if (entity_is_task(se)) { if (entity_is_task(se)) {
__update_task_entity_contrib(se); __update_task_entity_contrib(se);
} else { } else {
__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
__update_group_entity_contrib(se); __update_group_entity_contrib(se);
} }
@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
/*
 * Update the rq-level runnable average (rq->avg) using rq->clock_task and the
 * given runnable state, then hand rq->avg and the rq's own cfs_rq (rq->cfs)
 * to __update_tg_runnable_avg().
 */
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{ {
__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
} }
/* Add the load generated by se into cfs_rq's child load-average */ /* Add the load generated by se into cfs_rq's child load-average */

View File

@ -113,6 +113,7 @@ struct task_group {
atomic_t load_weight; atomic_t load_weight;
atomic64_t load_avg; atomic64_t load_avg;
atomic_t runnable_avg;
#endif #endif
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
@ -234,6 +235,7 @@ struct cfs_rq {
atomic64_t decay_counter, removed_load; atomic64_t decay_counter, removed_load;
u64 last_decay; u64 last_decay;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
u32 tg_runnable_contrib;
u64 tg_load_contrib; u64 tg_load_contrib;
#endif #endif
#endif #endif