From 84599f8a59e77699f18f06948cea171a349a3f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jun 2009 12:34:17 -0700 Subject: [PATCH 1/4] sched, x86: Fix cpufreq + sched_clock() TSC scaling For freqency dependent TSCs we only scale the cycles, we do not account for the discrepancy in absolute value. Our current formula is: time = cycles * mult (where mult is a function of the cpu-speed on variable tsc machines) Suppose our current cycle count is 10, and we have a multiplier of 5, then our time value would end up being 50. Now cpufreq comes along and changes the multiplier to say 3 or 7, which would result in our time being resp. 30 or 70. That means that we can observe random jumps in the time value due to frequency changes in both fwd and bwd direction. So what this patch does is change the formula to: time = cycles * frequency + offset And we calculate offset so that time_before == time_after, thereby ridding us of these jumps in time. [ Impact: fix/reduce sched_clock() jumps across frequency changing events ] Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar Chucked-on-by: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/timer.h | 6 +++++- arch/x86/kernel/tsc.c | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index bd37ed444a21..20ca9c4d4686 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -45,12 +45,16 @@ extern int no_timer_check; */ DECLARE_PER_CPU(unsigned long, cyc2ns); +DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; + int cpu = smp_processor_id(); + unsigned long long ns = per_cpu(cyc2ns_offset, cpu); + ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; + return ns; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057e98fe..ef4dac50143f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -589,22 +589,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); */ DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now; + unsigned long long tsc_now, ns_now, *offset; unsigned long flags, *scale; local_irq_save(flags); sched_clock_idle_sleep_event(); scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); rdtscll(tsc_now); ns_now = __cycles_2_ns(tsc_now); - if (cpu_khz) + if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + } sched_clock_idle_wakeup_event(0); local_irq_restore(flags); From fd5e1b5dbaa8b4aacc0e251d74182eda37062194 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 13:34:19 +0800 Subject: [PATCH 2/4] sched: Remove unneeded __ref tag Those two functions no longer call alloc_bootmmem_cpumask_var(), so no need to tag them with __init_refok. Signed-off-by: Li Zefan Acked-by: Pekka Enberg LKML-Reference: <4A35DD5B.9050106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_cpupri.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aaa..00567959ab17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7828,7 +7828,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { gfp_t gfp = GFP_KERNEL; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 7deffc9f0e5f..e6c251790dde 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp, bool bootmem) { gfp_t gfp = GFP_KERNEL; int i; From 348ec61e6268c3cd7ee75cfa50e408184a941506 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Wed, 17 Jun 2009 22:20:55 +0900 Subject: [PATCH 3/4] sched: Hide runqueues from direct refer at source code level There are some points which refer the per-cpu value "runqueues" directly. sched.c provides nice abstraction, such as cpu_rq() and this_rq(), so we should use these macros when looking runqueues. Signed-off-by: Hitoshi Mitake LKML-Reference: <20090617.222055.374768827975756908.mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 467ca72f1657..70c7e0b79946 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, spread, rq0_min_vruntime, spread0; - struct rq *rq = &per_cpu(runqueues, cpu); + struct rq *rq = cpu_rq(cpu); struct sched_entity *last; unsigned long flags; @@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); @@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) static void print_cpu(struct seq_file *m, int cpu) { - struct rq *rq = &per_cpu(runqueues, cpu); + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_X86 { From 3104bf03a923c72043a9c5009d9cd56724304916 Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Tue, 16 Jun 2009 10:35:12 +0200 Subject: [PATCH 4/4] sched: Fix out of scope variable access in sched_slice() Access to local variable lw is aliased by usage of pointer load. Access to pointer load in calc_delta_mine() happens when lw is already out of scope. [ Reported by static code analysis. ] Signed-off-by: Christian Engelmayer LKML-Reference: <20090616103512.0c846e51@frequentis.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5f9650e8fe75..ba7fd6e9556f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *load; + struct load_weight lw; cfs_rq = cfs_rq_of(se); load = &cfs_rq->load; if (unlikely(!se->on_rq)) { - struct load_weight lw = cfs_rq->load; + lw = cfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw;