From 45ceebf77653975815d82fcf7cec0a164215ae11 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Apr 2013 15:10:49 -0400 Subject: [PATCH 01/42] sched: Factor out load calculation code from sched/core.c --> sched/proc.c This large chunk of load calculation code can be easily divorced from the main core.c scheduler file, with only a couple prototypes and externs added to a kernel/sched header. Some recent commits expanded the code and the documentation of it, making it large enough to warrant separation. For example, see: 556061b, "sched/nohz: Fix rq->cpu_load[] calculations" 5aaa0b7, "sched/nohz: Fix rq->cpu_load calculations some more" 5167e8d, "sched/nohz: Rewrite and fix load-avg computation -- again" More importantly, it helps reduce the size of the main sched/core.c by yet another significant amount (~600 lines). Signed-off-by: Paul Gortmaker Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1366398650-31599-2-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 569 ----------------------------------------- kernel/sched/proc.c | 578 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 8 + 4 files changed, 587 insertions(+), 570 deletions(-) create mode 100644 kernel/sched/proc.c diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index deaf90e4a1de..54adcf35f495 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..bfa7e77e0b50 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,575 +2056,6 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - * nr_active = 0; - * for_each_possible_cpu(cpu) - * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach - * to calculating nr_active. - * - * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate - * to obtain the same result. See calc_load_fold_active(). - * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding - * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. - * - * This places an upper-bound on the IRQ-off latency of the machine. Then - * again, being late doesn't loose the delta, just wrecks the sample. - * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu - * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. - * - * This covers the NO_HZ=n code, for extra head-aches, see the comment below. - */ - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - * - When we go NO_HZ idle during the window, we can negate our sample - * contribution, causing under-accounting. - * - * We avoid this by keeping two idle-delta counters and flipping them - * when the window starts, thus separating old and new NO_HZ load. - * - * The only trick is the slight shift in index flip for read vs write. - * - * 0s 5s 10s 15s - * +10 +10 +10 +10 - * |-|-----------|-|-----------|-|-----------|-| - * r:0 0 1 1 0 0 1 1 0 - * w:0 1 1 0 0 1 1 0 0 - * - * This ensures we'll fold the old idle contribution in this window while - * accumlating the new one. - * - * - When we wake up from NO_HZ idle during the window, we push up our - * contribution, since we effectively move our sample point to a known - * busy state. - * - * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which - * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ - int idx = calc_load_idx; - - /* - * See calc_global_nohz(), if we observe the new index, we also - * need to observe the new update time. - */ - smp_rmb(); - - /* - * If the folding window started, make sure we start writing in the - * next idle-delta. - */ - if (!time_before(jiffies, calc_load_update)) - idx++; - - return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ - return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ - struct rq *this_rq = this_rq(); - long delta; - - /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. - */ - delta = calc_load_fold_active(this_rq); - if (delta) { - int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); - } -} - -void calc_load_exit_idle(void) -{ - struct rq *this_rq = this_rq(); - - /* - * If we're still before the sample window, we're done. - */ - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - /* - * We woke inside or after the sample window, this means we're already - * accounted through the nohz accounting, so skip the entire deal and - * sync up for the next window. - */ - this_rq->calc_load_update = calc_load_update; - if (time_before(jiffies, this_rq->calc_load_update + 10)) - this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ - int idx = calc_load_read_idx(); - long delta = 0; - - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(void) -{ - long delta, active, n; - - if (!time_before(jiffies, calc_load_update + 10)) { - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Flip the idle index... - * - * Make sure we first write the new time then flip the index, so that - * calc_load_write_idx() will see the new time when it reads the new - * index, this avoids a double flip messing things up. - */ - smp_wmb(); - calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active, delta; - - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; - - /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. - */ - calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Called from scheduler_tick() - */ -static void update_cpu_load_active(struct rq *this_rq) -{ - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); - - calc_load_account_active(this_rq); -} - #ifdef CONFIG_SMP /* diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 000000000000..bb3a6a0b8623 --- /dev/null +++ b/kernel/sched/proc.c @@ -0,0 +1,578 @@ +/* + * kernel/sched/proc.c + * + * Kernel load calculations, forked from sched/core.c + */ + +#include + +#include "sched.h" + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); + + calc_load_account_active(this_rq); +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d6155..a38ee0a0650e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,8 +10,16 @@ #include "cpupri.h" #include "cpuacct.h" +struct rq; + extern __read_mostly int scheduler_running; +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern long calc_load_fold_active(struct rq *this_rq); +extern void update_cpu_load_active(struct rq *this_rq); + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], From 8527632dc95472adb571701e852479531c0567a2 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Apr 2013 15:10:50 -0400 Subject: [PATCH 02/42] sched: Move update_load_*() methods from sched.h to fair.c These inlines are only used by kernel/sched/fair.c so they do not need to be present in the main kernel/sched/sched.h file. Signed-off-by: Paul Gortmaker Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1366398650-31599-3-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c8..08a554dd3e90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a38ee0a0650e..f1f6256c1224 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -892,24 +892,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that From 424c93fe4cbe719e7fd7169248d2b648c493b68d Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 9 May 2013 11:24:03 -0500 Subject: [PATCH 03/42] sched: Use this_rq() helper It is a few instructions more efficent to and slightly more readable to use this_rq()-> instead of cpu_rq(smp_processor_id())-> . Size comparison of kernel/sched/fair.o: text data bss dec hex filename 27972 122 26 28120 6dd8 fair.o.before 27956 122 26 28104 6dc8 fair.o.after Signed-off-by: Nathan Zimmer Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1368116643-87971-1-git-send-email-nzimmer@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- kernel/sched/rt.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c8..f2c9c0c3406c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5418,10 +5418,9 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || !sd->nohz_idle) goto unlock; @@ -5436,10 +5435,9 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || sd->nohz_idle) goto unlock; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4ab..7aced2e3b085 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -472,7 +472,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) #ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_rq(smp_processor_id())->rd->span; + return this_rq()->rd->span; } #else static inline const struct cpumask *sched_rt_period_mask(void) From 41261b6a832ea0e788627f6a8707854423f9ff49 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 24 May 2013 18:07:49 +0200 Subject: [PATCH 04/42] sched/autogroup: Fix race with task_groups list In autogroup_create(), a tg is allocated and added to the task_groups list. If CONFIG_RT_GROUP_SCHED is set, this tg is then modified while on the list, without locking. This can race with someone walking the list, like __enable_runtime() during CPU unplug, and result in a use-after-free bug. To fix this, move sched_online_group(), which adds the tg to the list, to the end of the autogroup_create() function after the modification. Signed-off-by: Gerald Schaefer Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369411669-46971-2-git-send-email-gerald.schaefer@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9e..4a073539c58e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; - sched_online_group(tg, &root_task_group); - kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); @@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) #endif tg->autogroup = ag; + sched_online_group(tg, &root_task_group); return ag; out_free: From c5405a495e88d93cf9b4f4cc91507c7f4afcb901 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Thu, 11 Apr 2013 21:04:59 +0800 Subject: [PATCH 05/42] sched: Remove redundant update_runtime notifier migration_call() will do all the things that update_runtime() does. So let's remove it. Furthermore, there is potential risk that the current code will catch BUG_ON at line 689 of rt.c when do cpu hotplug while there are realtime threads running because of enabling runtime twice while the rt_runtime may already changed. Signed-off-by: Neil Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365685499-26515-1-git-send-email-zhangwm@marvell.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- kernel/sched/rt.c | 40 ---------------------------------------- kernel/sched/sched.h | 1 - 3 files changed, 44 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfa7e77e0b50..79e48e6a9385 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6285,9 +6285,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - init_hrtick(); /* Move init over to a non-isolated CPU */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7aced2e3b085..8853ab17b750 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -699,15 +699,6 @@ balanced: } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static void __enable_runtime(struct rq *rq) { rt_rq_iter_t iter; @@ -732,37 +723,6 @@ static void __enable_runtime(struct rq *rq) } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1f6256c1224..c806c61a1261 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1041,7 +1041,6 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); From 77bd39702f0b3840cea17681409270b16a3b93c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:50:58 +0200 Subject: [PATCH 06/42] sched: Update rq clock before migrating tasks out of dying CPU Because the sched_class::put_prev_task() callback of rt and fair classes are referring to the rq clock to update their runtime statistics. There is a missing rq clock update from the CPU hotplug notifier's entry point of the scheduler. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79e48e6a9385..7bf0418dc60f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4378,6 +4378,13 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* + * put_prev_task() and pick_next_task() sched + * class method both need to have an up-to-date + * value of rq->clock[_task] + */ + update_rq_clock(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only From 71b1da46ff70309a2ec12ce943aa0d192d2c8f0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:50:59 +0200 Subject: [PATCH 07/42] sched: Update rq clock before setting fair group shares Because we may update the execution time in sched_group_set_shares()->update_cfs_shares()->reweight_entity()->update_curr() before reweighting the entity while setting the group shares and this requires an uptodate version of the runqueue clock. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f62b16dfba63..f76ca21711bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6107,6 +6107,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); + + /* Possible calls to update_curr() need rq clock */ + update_rq_clock(rq); for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); From 1ad4ec0dc740c4183acd6d6e367ca52b28e4fa94 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:00 +0200 Subject: [PATCH 08/42] sched: Update rq clock before calling check_preempt_curr() check_preempt_curr() of fair class needs an uptodate sched clock value to update runtime stats of the current task of the target's rq. When a task is woken up, activate_task() is usually called right before ttwu_do_wakeup() unless the task is still in the runqueue. In the latter case we need to update the rq clock explicitly because activate_task() isn't here to do the job for us. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bf0418dc60f..46d00172ae4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1365,6 +1365,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p); if (p->on_rq) { + /* check_preempt_curr() may use rq clock */ + update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } From 1a55af2e45cce0ff13bc33c8ee99da84e188b615 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:01 +0200 Subject: [PATCH 09/42] sched: Update rq clock earlier in unthrottle_cfs_rq In this function we are making use of rq->clock right before the update of the rq clock, let's just call update_rq_clock() just before that to avoid using a stale rq clock value. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f76ca21711bb..1c8762a5370c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2319,12 +2319,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; cfs_rq->throttled = 0; + + update_rq_clock(rq); + raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - update_rq_clock(rq); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); From 78becc27097585c6aec7043834cadde950ae79f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:02 +0200 Subject: [PATCH 10/42] sched: Use an accessor to read the rq clock Read the runqueue clock through an accessor. This prepares for adding a debugging infrastructure to detect missing or redundant calls to update_rq_clock() between a scheduler's entry and exit point. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 44 ++++++++++++++++++++-------------------- kernel/sched/rt.c | 8 ++++---- kernel/sched/sched.h | 10 +++++++++ kernel/sched/stats.h | 8 ++++---- kernel/sched/stop_task.c | 8 ++++---- 6 files changed, 47 insertions(+), 37 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46d00172ae4a..36f85be2932b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -667,7 +667,7 @@ void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); - while ((s64)(rq->clock - rq->age_stamp) > period) { + while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { /* * Inline assembly required to prevent the compiler * optimising this loop into a divmod call. @@ -1328,7 +1328,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) p->sched_class->task_woken(rq, p); if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; + u64 delta = rq_clock(rq) - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; if (delta > max) @@ -2106,7 +2106,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock_task - p->se.exec_start; + ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) ns = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c8762a5370c..3ee1c2e4ae60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -704,7 +704,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; + u64 now = rq_clock_task(rq_of(cfs_rq)); unsigned long delta_exec; if (unlikely(!curr)) @@ -736,7 +736,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } /* @@ -756,14 +756,14 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); } #endif schedstat_set(se->statistics.wait_start, 0); @@ -789,7 +789,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock_task; + se->exec_start = rq_clock_task(rq_of(cfs_rq)); } /************************************************** @@ -1515,7 +1515,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } @@ -1530,7 +1530,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, * accumulated while sleeping. */ if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); if (se->avg.decay_count) { /* * In a wake-up migration we have to approximate the @@ -1625,7 +1625,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) tsk = task_of(se); if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; @@ -1642,7 +1642,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } } if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; if ((s64)delta < 0) delta = 0; @@ -1823,9 +1823,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); } #endif } @@ -2100,7 +2100,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) if (unlikely(cfs_rq->throttle_count)) return cfs_rq->throttled_clock_task; - return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; + return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } /* returns 0 on failure to allocate runtime */ @@ -2159,7 +2159,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) return; if (cfs_rq->runtime_remaining < 0) @@ -2248,7 +2248,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ - cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } #endif @@ -2263,7 +2263,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - cfs_rq->throttled_clock_task = rq->clock_task; + cfs_rq->throttled_clock_task = rq_clock_task(rq); cfs_rq->throttle_count++; return 0; @@ -2302,7 +2302,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq->clock; + cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -2323,7 +2323,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); @@ -2726,7 +2726,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) #else /* CONFIG_CFS_BANDWIDTH */ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { - return rq_of(cfs_rq)->clock_task; + return rq_clock_task(rq_of(cfs_rq)); } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, @@ -3966,7 +3966,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4322,7 +4322,7 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq->clock - age_stamp); + total = sched_avg_period() + (rq_clock(rq) - age_stamp); if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ @@ -5261,7 +5261,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - this_rq->idle_stamp = this_rq->clock; + this_rq->idle_stamp = rq_clock(this_rq); if (this_rq->avg_idle < sysctl_sched_migration_cost) return; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8853ab17b750..8d85f9ac4262 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -886,7 +886,7 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -896,7 +896,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1345,7 +1345,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); return p; } @@ -1997,7 +1997,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c806c61a1261..74ff659e964f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -548,6 +548,16 @@ DECLARE_PER_CPU(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +static inline u64 rq_clock(struct rq *rq) +{ + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + return rq->clock_task; +} + #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..17d7065c3872 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) */ static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = task_rq(t)->clock; + t->sched_info.last_queued = rq_clock(task_rq(t)); } /* @@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = task_rq(t)->clock - + unsigned long long delta = rq_clock(task_rq(t)) - t->sched_info.last_arrival; rq_sched_info_depart(task_rq(t), delta); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84a..e08fbeeb54b9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) struct task_struct *stop = rq->stop; if (stop && stop->on_rq) { - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); return stop; } @@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) struct task_struct *curr = rq->curr; u64 delta_exec; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); } @@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); } static void switched_to_stop(struct rq *rq, struct task_struct *p) From 84f9f3a15611536537d59060818a2354d5039ff3 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 2 May 2013 15:34:33 +0200 Subject: [PATCH 11/42] sched: Use swap() macro in scale_stime() Simple cleanup. Reported-by: Peter Zijlstra Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1367501673-6563-1-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..94691bcd7364 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) for (;;) { /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } + if (stime > rtime) + swap(rtime, stime); /* Make sure 'total' fits in 32 bits */ if (total >> 32) From 0de358f1c2642710d41190b73fbc295e675c4ab8 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 30 May 2013 14:34:20 +0530 Subject: [PATCH 12/42] sched/fair: Remove unused variable from expire_cfs_rq_runtime() Commit 78becc2709 ("sched: Use an accessor to read the rq clock") introduces rq_clock(), which obsoletes the use of the "rq" variable in expire_cfs_rq_runtime() and triggers this build warning: kernel/sched/fair.c: In function 'expire_cfs_rq_runtime': kernel/sched/fair.c:2159:13: warning: unused variable 'rq' [-Wunused-variable] Signed-off-by: Kamalesh Babulal Acked-by: Frederic Weisbecker Acked-by: Paul Turner Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1369904660-14169-1-git-send-email-kamalesh@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ee1c2e4ae60..143dcdbc47af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2156,7 +2156,6 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) From e23ee74777f389369431d77390c4b09332ce026a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 7 Jun 2013 15:37:43 -0400 Subject: [PATCH 13/42] sched/rt: Simplify pull_rt_task() logic and remove .leaf_rt_rq_list [ Peter, this is based off of some of my work, I ran it though a few tests and it passed. I also reviewed it, and added my SOB as I am somewhat a co-author to it. ] Based on the patch by Steven Rostedt from previous year: https://lkml.org/lkml/2012/4/18/517 1)Simplify pull_rt_task() logic: search in pushable tasks of dest runqueue. The only pullable tasks are the tasks which are pushable in their local rq, and no others. 2)Remove .leaf_rt_rq_list member of struct rt_rq and functions connected with it: nobody uses it since now. Signed-off-by: Kirill Tkhai Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/287571370557898@web7d.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 82 +++++++++----------------------------------- kernel/sched/sched.h | 1 - 2 files changed, 16 insertions(+), 67 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8d85f9ac4262..01970c8e64df 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg) (iter = next_task_group(iter)) && \ (rt_rq = iter->rt_rq[cpu_of(rq)]);) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_add_rcu(&rt_rq->leaf_rt_rq_list, - &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_del_rcu(&rt_rq->leaf_rt_rq_list); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t; #define for_each_rt_rq(rt_rq, iter, rq) \ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = NULL) @@ -1066,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - if (!rt_rq->rt_nr_running) - list_add_leaf_rt_rq(rt_rq); - if (head) list_add(&rt_se->run_list, queue); else @@ -1088,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); - if (!rt_rq->rt_nr_running) - list_del_leaf_rt_rq(rt_rq); } /* @@ -1394,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +/* + * Return the highest pushable rq's task, which is suitable to be executed + * on the cpu, NULL otherwise + */ +static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { - struct task_struct *next = NULL; - struct sched_rt_entity *rt_se; - struct rt_prio_array *array; - struct rt_rq *rt_rq; - int idx; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *p; - for_each_leaf_rt_rq(rt_rq, rq) { - array = &rt_rq->active; - idx = sched_find_first_bit(array->bitmap); -next_idx: - if (idx >= MAX_RT_PRIO) - continue; - if (next && next->prio <= idx) - continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p; + if (!has_pushable_tasks(rq)) + return NULL; - if (!rt_entity_is_task(rt_se)) - continue; - - p = rt_task_of(rt_se); - if (pick_rt_task(rq, p, cpu)) { - next = p; - break; - } - } - if (!next) { - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - goto next_idx; - } + plist_for_each_entry(p, head, pushable_tasks) { + if (pick_rt_task(rq, p, cpu)) + return p; } - return next; + return NULL; } static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); @@ -1703,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq) double_lock_balance(this_rq, src_rq); /* - * Are there still pullable RT tasks? + * We can pull only a task, which is pushable + * on its rq, and no others. */ - if (src_rq->rt.rt_nr_running <= 1) - goto skip; - - p = pick_next_highest_task_rt(src_rq, this_cpu); + p = pick_highest_pushable_task(src_rq, this_cpu); /* * Do we have an RT task that preempts diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 74ff659e964f..029601a61587 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -361,7 +361,6 @@ struct rt_rq { unsigned long rt_nr_boosted; struct rq *rq; - struct list_head leaf_rt_rq_list; struct task_group *tg; #endif }; From 22b958d8cc5127d22d2ad2141277d312d93fad6c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 4 Jun 2013 14:23:39 +0800 Subject: [PATCH 14/42] sched: Refine the code in unthrottle_cfs_rq() Directly use rq to save some code. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51AD87EB.1070605@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 143dcdbc47af..47a30be1fe83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2315,7 +2315,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) int enqueue = 1; long task_delta; - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + se = cfs_rq->tg->se[cpu_of(rq)]; cfs_rq->throttled = 0; From 8404c90d050733b3404dc36c500f63ccb0c972ce Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 4 Jun 2013 14:24:08 +0800 Subject: [PATCH 15/42] sched: Femove the useless declaration in kernel/sched/fair.c default_cfs_period(), do_sched_cfs_period_timer(), do_sched_cfs_slack_timer() already defined previously, no need to declare again. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51AD8808.7020608@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47a30be1fe83..c0ac2c3b56e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2618,10 +2618,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = From 0a0fca9d832b704f116a25badd1ca8c16771dcac Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2013 13:10:24 +0530 Subject: [PATCH 16/42] sched: Rename sched.c as sched/core.c in comments and Documentation Most of the stuff from kernel/sched.c was moved to kernel/sched/core.c long time back and the comments/Documentation never got updated. I figured it out when I was going through sched-domains.txt and so thought of fixing it globally. I haven't crossed check if the stuff that is referenced in sched/core.c by all these files is still present and hasn't changed as that wasn't the motive behind this patch. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/cdff76a265326ab8d71922a1db5be599f20aad45.1370329560.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- Documentation/cgroups/cpusets.txt | 2 +- Documentation/rt-mutex-design.txt | 2 +- Documentation/scheduler/sched-domains.txt | 4 ++-- Documentation/spinlocks.txt | 2 +- Documentation/virtual/uml/UserModeLinux-HOWTO.txt | 4 ++-- arch/avr32/kernel/process.c | 2 +- arch/cris/include/arch-v10/arch/bitops.h | 2 +- arch/ia64/kernel/head.S | 2 +- arch/mips/kernel/mips-mt-fpaff.c | 4 ++-- arch/mips/kernel/scall32-o32.S | 5 +++-- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/tile/include/asm/processor.h | 2 +- arch/tile/kernel/stack.c | 2 +- arch/um/kernel/sysrq.c | 2 +- include/linux/completion.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/spinlock_up.h | 2 +- include/uapi/asm-generic/unistd.h | 2 +- kernel/cpuset.c | 4 ++-- kernel/time.c | 2 +- kernel/workqueue_internal.h | 2 +- 21 files changed, 27 insertions(+), 26 deletions(-) diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 12e01d432bfe..7740038d82bc 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -373,7 +373,7 @@ can become very uneven. 1.7 What is sched_load_balance ? -------------------------------- -The kernel scheduler (kernel/sched.c) automatically load balances +The kernel scheduler (kernel/sched/core.c) automatically load balances tasks. If one CPU is underutilized, kernel code running on that CPU will look for tasks on other more overloaded CPUs and move those tasks to itself, within the constraints of such placement mechanisms diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt index 33ed8007a845..a5bcd7f5c33f 100644 --- a/Documentation/rt-mutex-design.txt +++ b/Documentation/rt-mutex-design.txt @@ -384,7 +384,7 @@ priority back. __rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the result does not equal the task's current priority, then rt_mutex_setprio is called to adjust the priority of the task to the new priority. -Note that rt_mutex_setprio is defined in kernel/sched.c to implement the +Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the actual change in priority. It is interesting to note that __rt_mutex_adjust_prio can either increase diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt index 443f0c76bab4..4af80b1c05aa 100644 --- a/Documentation/scheduler/sched-domains.txt +++ b/Documentation/scheduler/sched-domains.txt @@ -25,7 +25,7 @@ is treated as one entity. The load of a group is defined as the sum of the load of each of its member CPUs, and only when the load of a group becomes out of balance are tasks moved between groups. -In kernel/sched.c, trigger_load_balance() is run periodically on each CPU +In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU through scheduler_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run @@ -62,7 +62,7 @@ struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of the specifics and what to tune. Architectures may retain the regular override the default SD_*_INIT flags -while using the generic domain builder in kernel/sched.c if they wish to +while using the generic domain builder in kernel/sched/core.c if they wish to retain the traditional SMT->SMP->NUMA topology (or some subset of that). This can be done by #define'ing ARCH_HASH_SCHED_TUNE. diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt index 9dbe885ecd8d..97eaf5727178 100644 --- a/Documentation/spinlocks.txt +++ b/Documentation/spinlocks.txt @@ -137,7 +137,7 @@ don't block on each other (and thus there is no dead-lock wrt interrupts. But when you do the write-lock, you have to use the irq-safe version. For an example of being clever with rw-locks, see the "waitqueue_lock" -handling in kernel/sched.c - nothing ever _changes_ a wait-queue from +handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from within an interrupt, they only read the queue in order to know whom to wake up. So read-locks are safe (which is good: they are very common indeed), while write-locks need to protect themselves against interrupts. diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt index a5f8436753e7..f4099ca6b483 100644 --- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt +++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt @@ -3127,7 +3127,7 @@ at process_kern.c:156 #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) at process_kern.c:161 - #4 0x10001d12 in schedule () at sched.c:777 + #4 0x10001d12 in schedule () at core.c:777 #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 #6 0x1006aa10 in __down_failed () at semaphore.c:157 #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 @@ -3191,7 +3191,7 @@ at process_kern.c:161 161 _switch_to(prev, next); (gdb) - #4 0x10001d12 in schedule () at sched.c:777 + #4 0x10001d12 in schedule () at core.c:777 777 switch_to(prev, next, prev); (gdb) #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index e7b61494c312..c2731003edef 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -341,7 +341,7 @@ unsigned long get_wchan(struct task_struct *p) * is actually quite ugly. It might be possible to * determine the frame size automatically at build * time by doing this: - * - compile sched.c + * - compile sched/core.c * - disassemble the resulting sched.o * - look for 'sub sp,??' shortly after ':' */ diff --git a/arch/cris/include/arch-v10/arch/bitops.h b/arch/cris/include/arch-v10/arch/bitops.h index be85f6de25d3..03d9cfd92c8a 100644 --- a/arch/cris/include/arch-v10/arch/bitops.h +++ b/arch/cris/include/arch-v10/arch/bitops.h @@ -17,7 +17,7 @@ static inline unsigned long cris_swapnwbrlz(unsigned long w) in another register: ! __asm__ ("swapnwbr %2\n\tlz %2,%0" ! : "=r,r" (res), "=r,X" (dummy) : "1,0" (w)); - confuses gcc (sched.c, gcc from cris-dist-1.14). */ + confuses gcc (core.c, gcc from cris-dist-1.14). */ unsigned long res; __asm__ ("swapnwbr %0 \n\t" diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index 9be4e497f3d3..991ca336b8a2 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -1035,7 +1035,7 @@ END(ia64_delay_loop) * Return a CPU-local timestamp in nano-seconds. This timestamp is * NOT synchronized across CPUs its return value must never be * compared against the values returned on another CPU. The usage in - * kernel/sched.c ensures that. + * kernel/sched/core.c ensures that. * * The return-value of sched_clock() is NOT supposed to wrap-around. * If it did, it would cause some scheduling hiccups (at the worst). diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index fd814e08c945..cb098628aee8 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -27,12 +27,12 @@ unsigned long mt_fpemul_threshold; * FPU affinity with the user's requested processor affinity. * This code is 98% identical with the sys_sched_setaffinity() * and sys_sched_getaffinity() system calls, and should be - * updated when kernel/sched.c changes. + * updated when kernel/sched/core.c changes. */ /* * find_process_by_pid - find a process with a matching PID value. - * used in sys_sched_set/getaffinity() in kernel/sched.c, so + * used in sys_sched_set/getaffinity() in kernel/sched/core.c, so * cloned here. */ static inline struct task_struct *find_process_by_pid(pid_t pid) diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 9b36424b03c5..e9127ec612ef 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -476,8 +476,9 @@ einval: li v0, -ENOSYS /* * For FPU affinity scheduling on MIPS MT processors, we need to * intercept sys_sched_xxxaffinity() calls until we get a proper hook - * in kernel/sched.c. Considered only temporary we only support these - * hooks for the 32-bit kernel - there is no MIPS64 MT processor atm. + * in kernel/sched/core.c. Considered only temporary we only support + * these hooks for the 32-bit kernel - there is no MIPS64 MT processor + * atm. */ sys mipsmt_sys_sched_setaffinity 3 sys mipsmt_sys_sched_getaffinity 3 diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index a73668a5f30d..b467530e2485 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -38,7 +38,7 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm); /* * switch_mm is the entry point called from the architecture independent - * code in kernel/sched.c + * code in kernel/sched/core.c */ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index 2b70dfb1442e..b3f104953da2 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -225,7 +225,7 @@ extern int do_work_pending(struct pt_regs *regs, u32 flags); /* * Return saved (kernel) PC of a blocked thread. - * Only used in a printk() in kernel/sched.c, so don't work too hard. + * Only used in a printk() in kernel/sched/core.c, so don't work too hard. */ #define thread_saved_pc(t) ((t)->thread.pc) diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c index ed258b8ae320..af8dfc9665f6 100644 --- a/arch/tile/kernel/stack.c +++ b/arch/tile/kernel/stack.c @@ -442,7 +442,7 @@ void _KBacktraceIterator_init_current(struct KBacktraceIterator *kbt, ulong pc, regs_to_pt_regs(®s, pc, lr, sp, r52)); } -/* This is called only from kernel/sched.c, with esp == NULL */ +/* This is called only from kernel/sched/core.c, with esp == NULL */ void show_stack(struct task_struct *task, unsigned long *esp) { struct KBacktraceIterator kbt; diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 7d101a2a1541..0dc4d1c6f98a 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -39,7 +39,7 @@ void show_trace(struct task_struct *task, unsigned long * stack) static const int kstack_depth_to_print = 24; /* This recently started being used in arch-independent code too, as in - * kernel/sched.c.*/ + * kernel/sched/core.c.*/ void show_stack(struct task_struct *task, unsigned long *esp) { unsigned long *stack; diff --git a/include/linux/completion.h b/include/linux/completion.h index 33f0280fd533..3cd574d5b19e 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -5,7 +5,7 @@ * (C) Copyright 2001 Linus Torvalds * * Atomic wait-for-completion handler data structures. - * See kernel/sched.c for details. + * See kernel/sched/core.c for details. */ #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f463a46424e2..5ec99e5a50d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -803,7 +803,7 @@ static inline void perf_restore_debug_store(void) { } #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) /* - * This has to have a higher priority than migration_notifier in sched.c. + * This has to have a higher priority than migration_notifier in sched/core.c. */ #define perf_cpu_notifier(fn) \ do { \ diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index e2369c167dbd..8b3ac0d718eb 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -67,7 +67,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) -/* for sched.c and kernel_lock.c: */ +/* for sched/core.c and kernel_lock.c: */ # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) # define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) # define arch_spin_unlock(lock) do { barrier(); (void)(lock); } while (0) diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 0cc74c4403e4..a20a9b4d3871 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -361,7 +361,7 @@ __SYSCALL(__NR_syslog, sys_syslog) #define __NR_ptrace 117 __SYSCALL(__NR_ptrace, sys_ptrace) -/* kernel/sched.c */ +/* kernel/sched/core.c */ #define __NR_sched_setparam 118 __SYSCALL(__NR_sched_setparam, sys_sched_setparam) #define __NR_sched_setscheduler 119 diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe5..902d13fc2b13 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -540,7 +540,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched.c + * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. @@ -569,7 +569,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched.c routine partition_sched_domains() in a + * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) diff --git a/kernel/time.c b/kernel/time.c index d3617dbd3dca..7c7964c33ae7 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -11,7 +11,7 @@ * Modification history kernel/time.c * * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() + * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index ad83c96b2ece..7e2204db0b1a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void) /* * Scheduler hooks for concurrency managed workqueue. Only to be used from - * sched.c and workqueue.c. + * sched/core.c and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); From 4a850cbefa9592ddde3670a41c10c9576a657c43 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2013 16:12:43 +0530 Subject: [PATCH 17/42] sched: Remove unused params of build_sched_domain() build_sched_domain() never uses parameter struct s_data *d and so passing it is useless. Remove it. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/545e0b4536166a15b4475abcafe5ed0db4ad4a2c.1370436120.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8f071cc9f51..342e74419f8f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5943,9 +5943,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *child, - int cpu) + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) { struct sched_domain *sd = tl->init(tl, cpu); if (!sd) @@ -5985,7 +5984,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, sd = NULL; for (tl = sched_domain_topology; tl->init; tl++) { - sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) From 22da956953f371c1ee7a578c31ed8c5702cb52b1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2013 15:41:15 +0530 Subject: [PATCH 18/42] sched: Optimize build_sched_domains() for saving first SD node for a cpu We are saving first scheduling domain for a cpu in build_sched_domains() by iterating over the nested sd->child list. We don't actually need to do it this way. tl will be equal to sched_domain_topology for the first iteration and so we can set *per_cpu_ptr(d.sd, i) based on that. So, save pointer to first SD while running the iteration loop over tl's. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/fc473527cbc4dfa0b8eeef2a59db74684eb59a83.1370436120.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 342e74419f8f..137dcc03f66d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5985,16 +5985,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, sd = NULL; for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } - - while (sd->child) - sd = sd->child; - - *per_cpu_ptr(d.sd, i) = sd; } /* Build the groups for the domains */ From 1c6321694074163b5863c13d71c19ca953a3fb08 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Jun 2013 16:27:18 +0530 Subject: [PATCH 19/42] sched: Don't initialize alloc_state in build_sched_domains() alloc_state will be overwritten by __visit_domain_allocation_hell() and so we don't actually need to initialize alloc_state. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/df57734a075cc5ad130e1ae498702e24f2529ab8.1370861520.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 137dcc03f66d..3de62649869b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5969,7 +5969,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state = sa_none; + enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; int i, ret = -ENOMEM; From c75e01288ce9c9a6b7beb6b23c07d2e4d1db8c84 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Jun 2013 16:27:19 +0530 Subject: [PATCH 20/42] sched: Don't set sd->child to NULL when it is already NULL Memory for sd is allocated with kzalloc_node() which will initialize its fields with zero. In build_sched_domain() we are setting sd->child to child even if child is NULL, which isn't required. Lets do it only if child isn't NULL. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/f4753a1730051341003ad2ad29a3229c7356678e.1370861520.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3de62649869b..88c2c0ee5a52 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5955,8 +5955,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + sd->child = child; } - sd->child = child; set_domain_attribute(sd, attr); return sd; From 27723a68caf05381b0b0bc6e127da2c9e7bcb775 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Jun 2013 16:27:20 +0530 Subject: [PATCH 21/42] sched: Create for_each_sd_topology() For loop for traversing sched_domain_topology was used at multiple placed in core.c. This patch removes code redundancy by creating for_each_sd_topology(). Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/e0e04542f54e9464bd9da54f5ccfe62ec6c4c0bc.1370861520.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88c2c0ee5a52..547b7d3ff893 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5565,6 +5565,9 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->init; tl++) + #ifdef CONFIG_NUMA static int sched_domains_numa_levels; @@ -5862,7 +5865,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; sdd->sd = alloc_percpu(struct sched_domain *); @@ -5915,7 +5918,7 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { @@ -5983,7 +5986,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; From 0936629f01bb1b11772db8c36be421365238cbec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Jun 2013 16:32:43 +0530 Subject: [PATCH 22/42] sched: Use cached value of span instead of calling sched_domain_span() In the beginning of build_sched_groups() we called sched_domain_span() and cached its return value in span. Few statements later we are calling it again to get the same pointer. Lets use the cached value instead as it hasn't changed in between. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/834ecd507071ad88aff039352dbc7e063dd996a7.1370948150.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 547b7d3ff893..3388387e1330 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5347,7 +5347,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); - if (cpu != cpumask_first(sched_domain_span(sd))) + if (cpu != cpumask_first(span)) return 0; lockdep_assert_held(&sched_domains_mutex); From cd08e9234c987766ad077bba80eb5a07d0855525 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Jun 2013 16:32:44 +0530 Subject: [PATCH 23/42] sched: Fix memory leakage in build_sched_groups() In build_sched_groups() we don't need to call get_group() for cpus which are already covered in previous iterations. Calling get_group() would mark the group used and eventually leak it since we wouldn't connect it and not find it again to free it. This will happen only in cases where sg->cpumask contained more than one cpu (For any topology level). This patch would free sg's memory for all cpus leaving the group leader as the group isn't marked used now. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/7a61e955abdcbb1dfa9fe493f11a5ec53a11ddd3.1370948150.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3388387e1330..014c97f00732 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5357,12 +5357,12 @@ build_sched_groups(struct sched_domain *sd, int cpu) for_each_cpu(i, span) { struct sched_group *sg; - int group = get_group(i, sdd, &sg); - int j; + int group, j; if (cpumask_test_cpu(i, covered)) continue; + group = get_group(i, sdd, &sg); cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); From 94c95ba69f31e435416988ddb223c92e5b0e9e83 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Jun 2013 16:32:45 +0530 Subject: [PATCH 24/42] sched: Remove WARN_ON(!sd) from init_sched_groups_power() sd can't be NULL in init_sched_groups_power() and so checking it for NULL isn't useful. In case it is required, then also we need to rearrange the code a bit as we already accessed invalid pointer sd to get sg: sg = sd->groups. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2bbe633cd74b431c05253a8ce61fdfd5066a531b.1370948150.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 014c97f00732..21b1403a10a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5400,7 +5400,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; - WARN_ON(!sd || !sg); + WARN_ON(!sg); do { sg->group_weight = cpumask_weight(sched_group_cpus(sg)); From be7002e6c613d22976f2b8d4bae6121a5fc0433a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jun 2013 11:55:36 -0700 Subject: [PATCH 25/42] sched: Don't mix use of typedef ctl_table and struct ctl_table Just use struct ctl_table. Signed-off-by: Joe Perches Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371063336.2069.22.camel@joe-AO722 Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 21b1403a10a2..ceeaf0f45be0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4533,7 +4533,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return table; } -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) { struct ctl_table *entry, *table; struct sched_domain *sd; From 141965c7494d984b2bf24efd361a3125278869c6 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 26 Jun 2013 13:05:39 +0800 Subject: [PATCH 26/42] Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking" Remove CONFIG_FAIR_GROUP_SCHED that covers the runnable info, then we can use runnable load variables. Also remove 2 CONFIG_FAIR_GROUP_SCHED setting which is not in reverted patch(introduced in 9ee474f), but also need to revert. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51CA76A3.3050207@intel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +------ kernel/sched/core.c | 7 +------ kernel/sched/fair.c | 17 ++++------------- kernel/sched/sched.h | 19 ++----------------- 4 files changed, 8 insertions(+), 42 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 178a8d909f14..0019befea59a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -994,12 +994,7 @@ struct sched_entity { struct cfs_rq *my_q; #endif -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP /* Per-entity load-tracking */ struct sched_avg avg; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ceeaf0f45be0..0241b1b55a04 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1611,12 +1611,7 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0ac2c3b56e1..36eadaaa4e5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1128,8 +1128,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. @@ -3430,12 +3429,6 @@ unlock: return new_cpu; } -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -3459,7 +3452,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } } -#endif #endif /* CONFIG_SMP */ static unsigned long @@ -5861,7 +5853,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP /* * Remove our load from contribution when we leave sched_fair * and ensure we don't carry in an old decay_count if we @@ -5920,7 +5912,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP atomic64_set(&cfs_rq->decay_counter, 1); atomic64_set(&cfs_rq->removed_load, 0); #endif @@ -6162,9 +6154,8 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, -#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, -#endif + .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 029601a61587..77ce668ba302 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -269,12 +269,6 @@ struct cfs_rq { #endif #ifdef CONFIG_SMP -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. @@ -284,9 +278,9 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -/* These always depend on CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; u64 tg_load_contrib; #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1027,17 +1021,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); -/* - * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg - * becomes useful in lb - */ -#if defined(CONFIG_FAIR_GROUP_SCHED) extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else -static inline void idle_enter_fair(struct rq *this_rq) {} -static inline void idle_exit_fair(struct rq *this_rq) {} -#endif #else /* CONFIG_SMP */ From fa6bddeb14d59d701f846b174b59c9982e926e66 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:46 +0800 Subject: [PATCH 27/42] sched: Move a few runnable tg variables into CONFIG_SMP The following 2 variables are only used under CONFIG_SMP, so its better to move their definiation into CONFIG_SMP too. atomic64_t load_avg; atomic_t runnable_avg; Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-3-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 77ce668ba302..31d25f80a7c6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -149,9 +149,11 @@ struct task_group { unsigned long shares; atomic_t load_weight; +#ifdef CONFIG_SMP atomic64_t load_avg; atomic_t runnable_avg; #endif +#endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; From a75cdaa915e42ef0e6f38dc7f2a6a1deca91d648 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:47 +0800 Subject: [PATCH 28/42] sched: Set an initial value of runnable avg for new forked task We need to initialize the se.avg.{decay_count, load_avg_contrib} for a new forked task. Otherwise random values of above variables cause a mess when a new task is enqueued: enqueue_task_fair enqueue_entity enqueue_entity_load_avg and make fork balancing imbalance due to incorrect load_avg_contrib. Further more, Morten Rasmussen notice some tasks were not launched at once after created. So Paul and Peter suggest giving a start value for new task runnable avg time same as sched_slice(). PeterZ said: > So the 'problem' is that our running avg is a 'floating' average; ie. it > decays with time. Now we have to guess about the future of our newly > spawned task -- something that is nigh impossible seeing these CPU > vendors keep refusing to implement the crystal ball instruction. > > So there's two asymptotic cases we want to deal well with; 1) the case > where the newly spawned program will be 'nearly' idle for its lifetime; > and 2) the case where its cpu-bound. > > Since we have to guess, we'll go for worst case and assume its > cpu-bound; now we don't want to make the avg so heavy adjusting to the > near-idle case takes forever. We want to be able to quickly adjust and > lower our running avg. > > Now we also don't want to make our avg too light, such that it gets > decremented just for the new task not having had a chance to run yet -- > even if when it would run, it would be more cpu-bound than not. > > So what we do is we make the initial avg of the same duration as that we > guess it takes to run each task on the system at least once -- aka > sched_slice(). > > Of course we can defeat this with wakeup/fork bombs, but in the 'normal' > case it should be good enough. Paul also contributed most of the code comments in this commit. Signed-off-by: Alex Shi Reviewed-by: Gu Zheng Reviewed-by: Paul Turner [peterz; added explanation of sched_slice() usage] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-4-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++---- kernel/sched/fair.c | 24 ++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0241b1b55a04..729e7fc7634b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1611,10 +1611,6 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -#ifdef CONFIG_SMP - p->se.avg.runnable_avg_period = 0; - p->se.avg.runnable_avg_sum = 0; -#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -1758,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif + /* Initialize new task's runnable average */ + init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = 1; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36eadaaa4e5b..e1602a0fdbf8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -680,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +#ifdef CONFIG_SMP +static inline void __update_task_entity_contrib(struct sched_entity *se); + +/* Give new task start runnable values to heavy its load in infant time */ +void init_task_runnable_average(struct task_struct *p) +{ + u32 slice; + + p->se.avg.decay_count = 0; + slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; + p->se.avg.runnable_avg_sum = slice; + p->se.avg.runnable_avg_period = slice; + __update_task_entity_contrib(&p->se); +} +#else +void init_task_runnable_average(struct task_struct *p) +{ +} +#endif + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1527,6 +1547,10 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, * We track migrations using entity decay_count <= 0, on a wake-up * migration we use a negative decay count to track the remote decays * accumulated while sleeping. + * + * Newly forked tasks are enqueued with se->avg.decay_count == 0, they + * are seen by enqueue_entity_load_avg() as a migration with an already + * constructed load_avg_contrib. */ if (unlikely(se->avg.decay_count <= 0)) { se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31d25f80a7c6..9c65d46504b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1048,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); +extern void init_task_runnable_average(struct task_struct *p); + #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { From 282cf499f03ec1754b6c8c945c9674b02631fb0f Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:48 +0800 Subject: [PATCH 29/42] sched: Fix sleep time double accounting in enqueue entity The woken migrated task will __synchronize_entity_decay(se); in migrate_task_rq_fair, then it needs to set `se->avg.last_runnable_update -= (-se->avg.decay_count) << 20' before update_entity_load_avg, in order to avoid sleep time is updated twice for se.avg.load_avg_contrib in both __syncchronize and update_entity_load_avg. However if the sleeping task is woken up from the same cpu, it miss the last_runnable_update before update_entity_load_avg(se, 0, 1), then the sleep time was used twice in both functions. So we need to remove the double sleep time accounting. Paul also contributed some code comments in this commit. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-5-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e1602a0fdbf8..9bbc303598ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1571,7 +1571,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } wakeup = 0; } else { - __synchronize_entity_decay(se); + /* + * Task re-woke on same cpu (or else migrate_task_rq_fair() + * would have made count negative); we must be careful to avoid + * double-accounting blocked time after synchronizing decays. + */ + se->avg.last_runnable_update += __synchronize_entity_decay(se) + << 20; } /* migrated tasks did not contribute to our blocked load */ From 83dfd5235ebd66c284b97befe6eabff7132333e6 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:49 +0800 Subject: [PATCH 30/42] sched: Update cpu load after task_tick To get the latest runnable info, we need do this cpuload update after task_tick. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-6-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 729e7fc7634b..08746cc12370 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2165,8 +2165,8 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); + update_cpu_load_active(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); From b92486cbf2aa230d00f160664858495c81d2b37b Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:50 +0800 Subject: [PATCH 31/42] sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task They are the base values in load balance, update them with rq runnable load average, then the load balance will consider runnable load avg naturally. We also try to include the blocked_load_avg as cpu load in balancing, but that cause kbuild performance drop 6% on every Intel machine, and aim7/oltp drop on some of 4 CPU sockets machines. Or only add blocked_load_avg into get_rq_runable_load, hackbench still drop a little on NHM EX. Signed-off-by: Alex Shi Reviewed-by: Gu Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-7-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- kernel/sched/proc.c | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9bbc303598ea..e6d82cae4910 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2963,7 +2963,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->load.weight; + return cpu_rq(cpu)->cfs.runnable_load_avg; } /* @@ -3008,9 +3008,10 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) - return rq->load.weight / nr_running; + return load_avg / nr_running; return 0; } diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index bb3a6a0b8623..ce5cd4892e43 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -501,6 +501,18 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_SMP +unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->cfs.runnable_load_avg; +} +#else +unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->load.weight; +} +#endif + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -522,7 +534,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; + unsigned long load = get_rq_runnable_load(this_rq); unsigned long pending_updates; /* @@ -568,11 +580,12 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { + unsigned long load = get_rq_runnable_load(this_rq); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); + __update_cpu_load(this_rq, load, 1); calc_load_account_active(this_rq); } From a003a25b227d59ded9197ced109517f037d01c27 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:51 +0800 Subject: [PATCH 32/42] sched: Consider runnable load average in move_tasks() Aside from using runnable load average in background, move_tasks is also the key function in load balance. We need consider the runnable load average in it in order to make it an apple to apple load comparison. Morten had caught a div u64 bug on ARM, thanks! Thanks-to: Morten Rasmussen Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-8-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6d82cae4910..7948bb825985 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4179,11 +4179,14 @@ static int tg_load_down(struct task_group *tg, void *data) long cpu = (long)data; if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; + load = cpu_rq(cpu)->avg.load_avg_contrib; } else { + unsigned long tmp_rla; + tmp_rla = tg->parent->cfs_rq[cpu]->runnable_load_avg + 1; + load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + load *= tg->se[cpu]->avg.load_avg_contrib; + load /= tmp_rla; } tg->cfs_rq[cpu]->h_load = load; @@ -4209,12 +4212,9 @@ static void update_h_load(long cpu) static unsigned long task_h_load(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - unsigned long load; - load = p->se.load.weight; - load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - - return load; + return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, + cfs_rq->runnable_load_avg + 1); } #else static inline void update_blocked_averages(int cpu) @@ -4227,7 +4227,7 @@ static inline void update_h_load(long cpu) static unsigned long task_h_load(struct task_struct *p) { - return p->se.load.weight; + return p->se.avg.load_avg_contrib; } #endif From 72a4cf20cb71a327c636c7042fdacc25abffc87c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:53 +0800 Subject: [PATCH 33/42] sched: Change cfs_rq load avg to unsigned long Since the 'u64 runnable_load_avg, blocked_load_avg' in cfs_rq struct are smaller than 'unsigned long' cfs_rq->load.weight. We don't need u64 vaiables to describe them. unsigned long is more efficient and convenience. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-10-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++-- kernel/sched/fair.c | 7 ++----- kernel/sched/sched.h | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 75024a673520..160afdc5cdff 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -211,9 +211,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7948bb825985..f19772de1b1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4181,12 +4181,9 @@ static int tg_load_down(struct task_group *tg, void *data) if (!tg->parent) { load = cpu_rq(cpu)->avg.load_avg_contrib; } else { - unsigned long tmp_rla; - tmp_rla = tg->parent->cfs_rq[cpu]->runnable_load_avg + 1; - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->avg.load_avg_contrib; - load /= tmp_rla; + load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, + tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); } tg->cfs_rq[cpu]->h_load = load; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9c65d46504b1..9eb12d9edd35 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -277,7 +277,7 @@ struct cfs_rq { * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). */ - u64 runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; From bf5b986ed4d20428eeec3df4a03dbfebb9b6538c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:54 +0800 Subject: [PATCH 34/42] sched/tg: Use 'unsigned long' for load variable in task group Since tg->load_avg is smaller than tg->load_weight, we don't need a atomic64_t variable for load_avg in 32 bit machine. The same reason for cfs_rq->tg_load_contrib. The atomic_long_t/unsigned long variable type are more efficient and convenience for them. Signed-off-by: Alex Shi Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-11-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 +++--- kernel/sched/fair.c | 12 ++++++------ kernel/sched/sched.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 160afdc5cdff..d803989defc0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -215,9 +215,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", - (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", cfs_rq->tg_runnable_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f19772de1b1c..30ccc37112d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1075,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic64_read(&tg->load_avg); + tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; @@ -1356,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) { struct task_group *tg = cfs_rq->tg; - s64 tg_contrib; + long tg_contrib; tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; - if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic64_add(tg_contrib, &tg->load_avg); + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; } } @@ -1397,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) u64 contrib; contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div64_u64(contrib, - atomic64_read(&tg->load_avg) + 1); + se->avg.load_avg_contrib = div_u64(contrib, + atomic_long_read(&tg->load_avg) + 1); /* * For group entities we need to compute a correction term in the case diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9eb12d9edd35..5585eb25e9a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -150,7 +150,7 @@ struct task_group { atomic_t load_weight; #ifdef CONFIG_SMP - atomic64_t load_avg; + atomic_long_t load_avg; atomic_t runnable_avg; #endif #endif @@ -284,7 +284,7 @@ struct cfs_rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; - u64 tg_load_contrib; + unsigned long tg_load_contrib; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* From 2509940fd71c2e2915a05052bbdbf2d478364184 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:55 +0800 Subject: [PATCH 35/42] sched/cfs_rq: Change atomic64_t removed_load to atomic_long_t Similar to runnable_load_avg, blocked_load_avg variable, long type is enough for removed_load in 64 bit or 32 bit machine. Then we avoid the expensive atomic64 operations on 32 bit machine. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-12-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++---- kernel/sched/sched.h | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 30ccc37112d0..b43474a964c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1517,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) if (!decays && !force_update) return; - if (atomic64_read(&cfs_rq->removed_load)) { - u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + if (atomic_long_read(&cfs_rq->removed_load)) { + unsigned long removed_load; + removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); subtract_blocked_load_contrib(cfs_rq, removed_load); } @@ -3480,7 +3481,8 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) */ if (se->avg.decay_count) { se->avg.decay_count = -__synchronize_entity_decay(se); - atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + atomic_long_add(se->avg.load_avg_contrib, + &cfs_rq->removed_load); } } #endif /* CONFIG_SMP */ @@ -5942,7 +5944,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #endif #ifdef CONFIG_SMP atomic64_set(&cfs_rq->decay_counter, 1); - atomic64_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load, 0); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5585eb25e9a3..705991906fbe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -278,8 +278,9 @@ struct cfs_rq { * the FAIR_GROUP_SCHED case). */ unsigned long runnable_load_avg, blocked_load_avg; - atomic64_t decay_counter, removed_load; + atomic64_t decay_counter; u64 last_decay; + atomic_long_t removed_load; #ifdef CONFIG_FAIR_GROUP_SCHED /* Required to track per-cpu representation of a task_group */ From a9cef46a10cc1b84bf2cdf4060766d858c0439d8 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:56 +0800 Subject: [PATCH 36/42] sched/tg: Remove tg.load_weight Since no one use it. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-13-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 705991906fbe..ef0a7b2439dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -148,7 +148,6 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; - atomic_t load_weight; #ifdef CONFIG_SMP atomic_long_t load_avg; atomic_t runnable_avg; From a9dc5d0e33c677619e4b97a38c23db1a42857905 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:57 +0800 Subject: [PATCH 37/42] sched: Change get_rq_runnable_load() to static and inline Based-on-patch-by: Fengguang Wu Signed-off-by: Alex Shi Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-14-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index ce5cd4892e43..16f5a30f9c88 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -502,12 +502,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, } #ifdef CONFIG_SMP -unsigned long get_rq_runnable_load(struct rq *rq) +static inline unsigned long get_rq_runnable_load(struct rq *rq) { return rq->cfs.runnable_load_avg; } #else -unsigned long get_rq_runnable_load(struct rq *rq) +static inline unsigned long get_rq_runnable_load(struct rq *rq) { return rq->load.weight; } From 939fd731eb88a0cdd9058d0b0143563172a217d7 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 25 Jun 2013 13:33:36 +0530 Subject: [PATCH 38/42] sched/debug: Add load-tracking statistics to task At present we print per-entity load-tracking statistics for cfs_rq of cgroups/runqueues. Given that per task statistics is maintained, it can be used to know the contribution made by the task to its parenting cfs_rq level. This patch adds per-task load-tracking statistics to /proc//sched. Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130625080336.GA20175@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d803989defc0..626320985366 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -566,6 +566,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + P(se.avg.runnable_avg_sum); + P(se.avg.runnable_avg_period); + P(se.avg.load_avg_contrib); + P(se.avg.decay_count); +#endif P(policy); P(prio); #undef PN From 0fc576d592bd137437fdeb059738b789e642b744 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 27 Jun 2013 11:24:18 +0530 Subject: [PATCH 39/42] sched/fair: Fix typo describing flags in enqueue_entity Fix spelling of 'calling' in description of se flags in enqueue_entity(). Signed-off-by: Kamalesh Babulal Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130627055418.GA18582@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b43474a964c2..f77f9c527449 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1760,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update the normalized vruntime before updating min_vruntime - * through callig update_curr(). + * through calling update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; From 239003ea2e30374d1cdfee788867e497cca2366c Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 27 Jun 2013 11:34:09 +0530 Subject: [PATCH 40/42] sched: Fix typo in struct sched_avg member description Remove extra 'for' from the description about member of struct sched_avg. Signed-off-by: Kamalesh Babulal Cc: pjt@google.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130627060409.GB18582@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0019befea59a..ec80684a0127 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -924,7 +924,7 @@ struct load_weight { struct sched_avg { /* * These sums represent an infinite geometric series and so are bound - * above by 1024/(1-y). Thus we only need a u32 to store them for for all + * above by 1024/(1-y). Thus we only need a u32 to store them for all * choices of y < 1-2^(-32)*1024. */ u32 runnable_avg_sum, runnable_avg_period; From add332a1523a09cf6d429933f1e2fb4ccdfe6479 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 27 Jun 2013 22:20:05 +0530 Subject: [PATCH 41/42] sched/debug: Fix formatting of /proc//sched This patch alters format string's width, to align all statistics at par with the longest struct sched_statistic member name under /proc//sched. Signed-off-by: Kamalesh Babulal Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130627165005.GA15583@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 626320985366..159561415d13 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -493,15 +493,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, get_nr_threads(p)); SEQ_printf(m, - "---------------------------------------------------------\n"); + "---------------------------------------------------------" + "----------\n"); #define __P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) #define __PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) PN(se.exec_start); PN(se.vruntime); @@ -560,9 +561,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) } #endif __P(nr_switches); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); @@ -585,7 +586,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } } From 333bb864f192015a53b5060b829089decd0220ef Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 28 Jun 2013 19:10:35 +0800 Subject: [PATCH 42/42] sched/debug: Remove CONFIG_FAIR_GROUP_SCHED mask Now that we are using runnable load avg in sched balance, we don't need to keep it under CONFIG_FAIR_GROUP_SCHED. Also align the code style to #ifdef instead of #if defined() and reorder the tg output info. Signed-off-by: Alex Shi Cc: pjt@google.com Cc: kamalesh@linux.vnet.ibm.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1372417835-4698-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 159561415d13..e076bddd4c66 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", - atomic_long_read(&cfs_rq->tg->load_avg)); +#ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", atomic_read(&cfs_rq->tg->runnable_avg)); #endif +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -567,7 +569,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); P(se.avg.runnable_avg_period); P(se.avg.load_avg_contrib);