From e4e4e534faa3c2be4e165ce414f44b76ada7208c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Apr 2008 08:50:02 +0200 Subject: [PATCH 1/6] sched clock: revert various sched_clock() changes Found an interactivity problem on a quad core test-system - simple CPU loops would occasionally delay the system un an unacceptable way. After much debugging with Peter Zijlstra it turned out that the problem is caused by the string of sched_clock() changes - they caused the CPU clock to jump backwards a bit - which confuses the scheduler arithmetics. (which is unsigned for performance reasons) So revert: # c300ba2: sched_clock: and multiplier for TSC to gtod drift # c0c8773: sched_clock: only update deltas with local reads. # af52a90: sched_clock: stop maximum check on NO HZ # f7cce27: sched_clock: widen the max and min time This solves the interactivity problems. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- include/linux/sched.h | 17 +----- kernel/sched_clock.c | 109 +++++---------------------------------- kernel/time/tick-sched.c | 2 - 3 files changed, 14 insertions(+), 114 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5270d449ff9d..ea436bc1a0e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1572,28 +1572,13 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } - -#ifdef CONFIG_NO_HZ -static inline void sched_clock_tick_stop(int cpu) -{ -} - -static inline void sched_clock_tick_start(int cpu) -{ -} -#endif - -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); -#ifdef CONFIG_NO_HZ -extern void sched_clock_tick_stop(int cpu); -extern void sched_clock_tick_start(int cpu); #endif -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5a2dc7d8fd98..9a7844158ae8 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -44,11 +44,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -#define MULTI_SHIFT 15 -/* Max is double, Min is 1/2 */ -#define MAX_MULTI (2LL << MULTI_SHIFT) -#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) - struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -62,10 +57,6 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; - s64 multi; -#ifdef CONFIG_NO_HZ - int check_max; -#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -97,53 +88,18 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; - scd->multi = 1 << MULTI_SHIFT; -#ifdef CONFIG_NO_HZ - scd->check_max = 1; -#endif } sched_clock_running = 1; } -#ifdef CONFIG_NO_HZ -/* - * The dynamic ticks makes the delta jiffies inaccurate. This - * prevents us from checking the maximum time update. - * Disable the maximum check during stopped ticks. - */ -void sched_clock_tick_stop(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 0; -} - -void sched_clock_tick_start(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 1; -} - -static int check_max(struct sched_clock_data *scd) -{ - return scd->check_max; -} -#else -static int check_max(struct sched_clock_data *scd) -{ - return 1; -} -#endif /* CONFIG_NO_HZ */ - /* * update the percpu scd from the raw @now value * * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -152,31 +108,16 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - - /* - * At schedule tick the clock can be just under the gtod. We don't - * want to push it too prematurely. - */ - min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); - if (min_clock > TICK_NSEC) - min_clock -= TICK_NSEC / 2; + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - /* - * The clock must stay within a jiffie of the gtod. - * But since we may be at the start of a jiffy or the end of one - * we add another jiffy buffer. - */ - max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; + max_clock = min_clock + TICK_NSEC; - delta *= scd->multi; - delta >>= MULTI_SHIFT; - - if (unlikely(clock + delta > max_clock) && check_max(scd)) { + if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) clock = max_clock; else @@ -189,12 +130,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim if (unlikely(clock < min_clock)) clock = min_clock; - if (time) - *time = clock; - else { - scd->prev_raw = now; - scd->clock = clock; - } + scd->prev_raw = now; + scd->tick_jiffies = now_jiffies; + scd->clock = clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -238,26 +176,21 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); - - __update_sched_clock(scd, now, &clock); - - __raw_spin_unlock(&scd->lock); - } else { __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - clock = scd->clock; - __raw_spin_unlock(&scd->lock); } + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + return clock; } void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); - unsigned long now_jiffies = jiffies; - s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -269,29 +202,14 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); + __update_sched_clock(scd, now); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ - delta_gtod = now_gtod - scd->tick_gtod; - delta_raw = now - scd->tick_raw; - - if ((long)delta_raw > 0) { - mult = delta_gtod << MULTI_SHIFT; - do_div(mult, delta_raw); - scd->multi = mult; - if (scd->multi > MAX_MULTI) - scd->multi = MAX_MULTI; - else if (scd->multi < MIN_MULTI) - scd->multi = MIN_MULTI; - } else - scd->multi = 1 << MULTI_SHIFT; - scd->tick_raw = now; scd->tick_gtod = now_gtod; - scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -321,7 +239,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; - scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe44..f5da526424a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); - sched_clock_tick_stop(cpu); } /* @@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* From 50526968e99afbca34924abcb04658b6dd5c5ea5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 09:39:48 +0200 Subject: [PATCH 2/6] sched clock: clean up sched_clock_cpu() - simplify the remote clock rebasing Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 9a7844158ae8..b96559cb96a5 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -169,11 +169,8 @@ u64 sched_clock_cpu(int cpu) lock_double_clock(scd, my_scd); - now -= my_scd->tick_raw; - now += scd->tick_raw; - - now += my_scd->tick_gtod; - now -= scd->tick_gtod; + now += scd->tick_raw - my_scd->tick_raw; + now += my_scd->tick_gtod - scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); } else { From 18e4e36c66d6edbdefc639692206cdf01e468713 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 10:13:35 +0200 Subject: [PATCH 3/6] sched: eliminate scd->prev_raw eliminate prev_raw and use tick_raw instead. It's enough to base the current time on the scheduler tick timestamp alone - the monotonicity and maximum checks will prevent any damage. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index b96559cb96a5..4b8474c966dc 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -53,7 +53,6 @@ struct sched_clock_data { raw_spinlock_t lock; unsigned long tick_jiffies; - u64 prev_raw; u64 tick_raw; u64 tick_gtod; u64 clock; @@ -84,7 +83,6 @@ void sched_clock_init(void) scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; scd->tick_jiffies = now_jiffies; - scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -105,7 +103,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) long delta_jiffies = now_jiffies - scd->tick_jiffies; u64 clock = scd->clock; u64 min_clock, max_clock; - s64 delta = now - scd->prev_raw; + s64 delta = now - scd->tick_raw; WARN_ON_ONCE(!irqs_disabled()); min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; @@ -130,7 +128,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(clock < min_clock)) clock = min_clock; - scd->prev_raw = now; scd->tick_jiffies = now_jiffies; scd->clock = clock; } @@ -234,7 +231,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) * rq clock: */ __raw_spin_lock(&scd->lock); - scd->prev_raw = now; scd->clock += delta_ns; __raw_spin_unlock(&scd->lock); From 56b906126d33904d4d67615d0d5b95dbdf1f27ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 10:15:55 +0200 Subject: [PATCH 4/6] sched clock: simplify __update_sched_clock() - return the current clock instead of letting callers fetch it from scd->clock Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 4b8474c966dc..857a1291fd23 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -97,7 +97,7 @@ void sched_clock_init(void) * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -130,6 +130,8 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) scd->tick_jiffies = now_jiffies; scd->clock = clock; + + return clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -174,8 +176,7 @@ u64 sched_clock_cpu(int cpu) __raw_spin_lock(&scd->lock); } - __update_sched_clock(scd, now); - clock = scd->clock; + clock = __update_sched_clock(scd, now); __raw_spin_unlock(&scd->lock); From 4a273f209cc95d148f79b4c96d3d03997b44ffda Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 10:22:07 +0200 Subject: [PATCH 5/6] sched clock: couple local and remote clocks When taking the time of a remote CPU, use the opportunity to couple (sync) the clocks to each other. (in a monotonic way) Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 857a1291fd23..074edc989379 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -149,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - u64 now, clock; + u64 now, clock, this_clock, remote_clock; if (unlikely(!sched_clock_running)) return 0ull; @@ -158,26 +158,36 @@ u64 sched_clock_cpu(int cpu) now = sched_clock(); if (cpu != raw_smp_processor_id()) { - /* - * in order to update a remote cpu's clock based on our - * unstable raw time rebase it against: - * tick_raw (offset between raw counters) - * tick_gotd (tick offset between cpus) - */ struct sched_clock_data *my_scd = this_scd(); lock_double_clock(scd, my_scd); - now += scd->tick_raw - my_scd->tick_raw; - now += my_scd->tick_gtod - scd->tick_gtod; + this_clock = __update_sched_clock(my_scd, now); + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely(remote_clock < this_clock)) { + clock = this_clock; + scd->clock = clock; + } else { + /* + * Should be rare, but possible: + */ + clock = remote_clock; + my_scd->clock = remote_clock; + } __raw_spin_unlock(&my_scd->lock); } else { __raw_spin_lock(&scd->lock); + clock = __update_sched_clock(scd, now); } - clock = __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); return clock; @@ -223,7 +233,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); void sched_clock_idle_wakeup_event(u64 delta_ns) { struct sched_clock_data *scd = this_scd(); - u64 now = sched_clock(); /* * Override the previous timestamp and ignore all From c1955a3d4762e7a9bf84035eb3c4886a900f0d15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 08:59:03 +0200 Subject: [PATCH 6/6] sched_clock: delay using sched_clock() Some arch's can't handle sched_clock() being called too early - delay this until sched_clock_init() has been called. Reported-by: Bill Gatliff Signed-off-by: Peter Zijlstra Tested-by: Nishanth Aravamudan CC: Russell King - ARM Linux Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++----------- kernel/sched_clock.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ea436bc1a0e2..5850bfb968a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1551,16 +1551,10 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); + #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_init(void) -{ -} - -static inline u64 sched_clock_cpu(int cpu) -{ - return sched_clock(); -} - static inline void sched_clock_tick(void) { } @@ -1573,8 +1567,6 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } #else -extern void sched_clock_init(void); -extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 074edc989379..204991a0bfa7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -42,6 +42,8 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } +static __read_mostly int sched_clock_running; + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK struct sched_clock_data { @@ -70,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } -static __read_mostly int sched_clock_running; - void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); @@ -248,6 +248,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + #endif unsigned long long cpu_clock(int cpu)