From 64c7c8f88559624abdbe12b5da6502e8879f8d28 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 8 Nov 2005 21:39:04 -0800 Subject: [PATCH] [PATCH] sched: resched and cpu_idle rework Make some changes to the NEED_RESCHED and POLLING_NRFLAG to reduce confusion, and make their semantics rigid. Improves efficiency of resched_task and some cpu_idle routines. * In resched_task: - TIF_NEED_RESCHED is only cleared with the task's runqueue lock held, and as we hold it during resched_task, then there is no need for an atomic test and set there. The only other time this should be set is when the task's quantum expires, in the timer interrupt - this is protected against because the rq lock is irq-safe. - If TIF_NEED_RESCHED is set, then we don't need to do anything. It won't get unset until the task get's schedule()d off. - If we are running on the same CPU as the task we resched, then set TIF_NEED_RESCHED and no further action is required. - If we are running on another CPU, and TIF_POLLING_NRFLAG is *not* set after TIF_NEED_RESCHED has been set, then we need to send an IPI. Using these rules, we are able to remove the test and set operation in resched_task, and make clear the previously vague semantics of POLLING_NRFLAG. * In idle routines: - Enter cpu_idle with preempt disabled. When the need_resched() condition becomes true, explicitly call schedule(). This makes things a bit clearer (IMO), but haven't updated all architectures yet. - Many do a test and clear of TIF_NEED_RESCHED for some reason. According to the resched_task rules, this isn't needed (and actually breaks the assumption that TIF_NEED_RESCHED is only cleared with the runqueue lock held). So remove that. Generally one less locked memory op when switching to the idle thread. - Many idle routines clear TIF_POLLING_NRFLAG, and only set it in the inner most polling idle loops. The above resched_task semantics allow it to be set until before the last time need_resched() is checked before going into a halt requiring interrupt wakeup. Many idle routines simply never enter such a halt, and so POLLING_NRFLAG can be always left set, completely eliminating resched IPIs when rescheduling the idle task. POLLING_NRFLAG width can be increased, to reduce the chance of resched IPIs. Signed-off-by: Nick Piggin Cc: Ingo Molnar Cc: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sched-arch.txt | 89 ++++++++++++++++++++++++++ arch/alpha/kernel/process.c | 10 +-- arch/arm/kernel/process.c | 14 ++-- arch/i386/kernel/apm.c | 20 +++++- arch/i386/kernel/process.c | 64 ++++++++---------- arch/ia64/kernel/process.c | 32 ++++----- arch/parisc/kernel/process.c | 2 + arch/powerpc/platforms/iseries/setup.c | 10 +-- arch/powerpc/platforms/pseries/setup.c | 12 ++-- arch/ppc/kernel/idle.c | 20 +++--- arch/ppc64/kernel/idle.c | 11 +--- arch/s390/kernel/process.c | 15 +++-- arch/sh/kernel/process.c | 12 +--- arch/sh64/kernel/process.c | 14 +--- arch/sparc/kernel/process.c | 35 +++++----- arch/sparc64/kernel/process.c | 20 ++++-- arch/sparc64/kernel/smp.c | 13 +--- arch/x86_64/kernel/process.c | 67 +++++++++---------- drivers/acpi/processor_idle.c | 37 +++++++---- kernel/sched.c | 21 ++++-- 20 files changed, 297 insertions(+), 221 deletions(-) create mode 100644 Documentation/sched-arch.txt diff --git a/Documentation/sched-arch.txt b/Documentation/sched-arch.txt new file mode 100644 index 000000000000..941615a9769b --- /dev/null +++ b/Documentation/sched-arch.txt @@ -0,0 +1,89 @@ + CPU Scheduler implementation hints for architecture specific code + + Nick Piggin, 2005 + +Context switch +============== +1. Runqueue locking +By default, the switch_to arch function is called with the runqueue +locked. This is usually not a problem unless switch_to may need to +take the runqueue lock. This is usually due to a wake up operation in +the context switch. See include/asm-ia64/system.h for an example. + +To request the scheduler call switch_to with the runqueue unlocked, +you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file +(typically the one where switch_to is defined). + +Unlocked context switches introduce only a very minor performance +penalty to the core scheduler implementation in the CONFIG_SMP case. + +2. Interrupt status +By default, the switch_to arch function is called with interrupts +disabled. Interrupts may be enabled over the call if it is likely to +introduce a significant interrupt latency by adding the line +`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for +unlocked context switches. This define also implies +`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an +example. + + +CPU idle +======== +Your cpu_idle routines need to obey the following rules: + +1. Preempt should now disabled over idle routines. Should only + be enabled to call schedule() then disabled again. + +2. need_resched/TIF_NEED_RESCHED is only ever set, and will never + be cleared until the running task has called schedule(). Idle + threads need only ever query need_resched, and may never set or + clear it. + +3. When cpu_idle finds (need_resched() == 'true'), it should call + schedule(). It should not call schedule() otherwise. + +4. The only time interrupts need to be disabled when checking + need_resched is if we are about to sleep the processor until + the next interrupt (this doesn't provide any protection of + need_resched, it prevents losing an interrupt). + + 4a. Common problem with this type of sleep appears to be: + local_irq_disable(); + if (!need_resched()) { + local_irq_enable(); + *** resched interrupt arrives here *** + __asm__("sleep until next interrupt"); + } + +5. TIF_POLLING_NRFLAG can be set by idle routines that do not + need an interrupt to wake them up when need_resched goes high. + In other words, they must be periodically polling need_resched, + although it may be reasonable to do some background work or enter + a low CPU priority. + + 5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter + an interrupt sleep, it needs to be cleared then a memory + barrier issued (followed by a test of need_resched with + interrupts disabled, as explained in 3). + +arch/i386/kernel/process.c has examples of both polling and +sleeping idle functions. + + +Possible arch/ problems +======================= + +Possible arch problems I found (and either tried to fix or didn't): + +h8300 - Is such sleeping racy vs interrupts? (See #4a). + The H8/300 manual I found indicates yes, however disabling IRQs + over the sleep mean only NMIs can wake it up, so can't fix easily + without doing spin waiting. + +ia64 - is safe_halt call racy vs interrupts? (does it sleep?) (See #4a) + +sh64 - Is sleeping racy vs interrupts? (See #4a) + +sparc - IRQs on at this point(?), change local_irq_save to _disable. + - TODO: needs secondary CPUs to disable preempt (See #1) + diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index eb20c3afff58..a8682612abc0 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -43,21 +43,17 @@ #include "proto.h" #include "pci_impl.h" -void default_idle(void) -{ - barrier(); -} - void cpu_idle(void) { + set_thread_flag(TIF_POLLING_NRFLAG); + while (1) { - void (*idle)(void) = default_idle; /* FIXME -- EV6 and LCA45 know how to power down the CPU. */ while (!need_resched()) - idle(); + cpu_relax(); schedule(); } } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 93dd92cc12f8..c0f6a119de3b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -86,12 +86,16 @@ EXPORT_SYMBOL(pm_power_off); */ void default_idle(void) { - local_irq_disable(); - if (!need_resched() && !hlt_counter) { - timer_dyn_reprogram(); - arch_idle(); + if (hlt_counter) + cpu_relax(); + else { + local_irq_disable(); + if (!need_resched()) { + timer_dyn_reprogram(); + arch_idle(); + } + local_irq_enable(); } - local_irq_enable(); } /* diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 86e80c551478..003548b8735f 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -769,8 +769,26 @@ static int set_system_power_state(u_short state) static int apm_do_idle(void) { u32 eax; + u8 ret = 0; + int idled = 0; + int polling; - if (apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax)) { + polling = test_thread_flag(TIF_POLLING_NRFLAG); + if (polling) { + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); + } + if (!need_resched()) { + idled = 1; + ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); + } + if (polling) + set_thread_flag(TIF_POLLING_NRFLAG); + + if (!idled) + return 0; + + if (ret) { static unsigned long t; /* This always fails on some SMP boards running UP kernels. diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 5296e284ea36..1cb261f225d5 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -99,14 +99,22 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { + local_irq_enable(); + if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); + while (!need_resched()) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + else + local_irq_enable(); + } + set_thread_flag(TIF_POLLING_NRFLAG); } else { - cpu_relax(); + while (!need_resched()) + cpu_relax(); } } #ifdef CONFIG_APM_MODULE @@ -120,29 +128,14 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { - int oldval; - local_irq_enable(); - /* - * Deal with another CPU just having chosen a thread to - * run here: - */ - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); - - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); - } + asm volatile( + "2:" + "testl %0, %1;" + "rep; nop;" + "je 2b;" + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); } #ifdef CONFIG_HOTPLUG_CPU @@ -181,6 +174,8 @@ void cpu_idle(void) { int cpu = smp_processor_id(); + set_thread_flag(TIF_POLLING_NRFLAG); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -246,15 +241,12 @@ static void mwait_idle(void) { local_irq_enable(); - if (!need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); - do { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - if (need_resched()) - break; - __mwait(0, 0); - } while (!need_resched()); - clear_thread_flag(TIF_POLLING_NRFLAG); + while (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (need_resched()) + break; + __mwait(0, 0); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 4c621fc3c3b9..640d6908f8ec 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -197,11 +197,15 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) - if (can_do_pal_halt) - safe_halt(); - else + while (!need_resched()) { + if (can_do_pal_halt) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + local_irq_enable(); + } else cpu_relax(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -263,16 +267,16 @@ void __attribute__((noreturn)) cpu_idle (void) { void (*mark_idle)(int) = ia64_mark_idle; + int cpu = smp_processor_id(); + set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while (1) { + if (!need_resched()) { + void (*idle)(void); #ifdef CONFIG_SMP - if (!need_resched()) min_xtp(); #endif - while (!need_resched()) { - void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; @@ -284,19 +288,17 @@ cpu_idle (void) if (!idle) idle = default_idle; (*idle)(); - } - - if (mark_idle) - (*mark_idle)(0); - + if (mark_idle) + (*mark_idle)(0); #ifdef CONFIG_SMP - normal_xtp(); + normal_xtp(); #endif + } preempt_enable_no_resched(); schedule(); preempt_disable(); check_pgt_cache(); - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(cpu)) play_dead(); } } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index f482f78de435..fee4f1f09adc 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -88,6 +88,8 @@ void default_idle(void) */ void cpu_idle(void) { + set_thread_flag(TIF_POLLING_NRFLAG); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index 0130f2619dac..7f8f0cda6a74 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -703,13 +703,10 @@ static void iseries_shared_idle(void) static void iseries_dedicated_idle(void) { long oldval; + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); HMT_low(); @@ -722,9 +719,6 @@ static void iseries_dedicated_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } ppc64_runlatch_on(); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 4854f5eb5c3d..a093a0d4dd69 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -469,6 +469,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu) * more. */ clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); /* * SMT dynamic mode. Cede will result in this thread going @@ -481,6 +482,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu) cede_processor(); else local_irq_enable(); + set_thread_flag(TIF_POLLING_NRFLAG); } else { /* * Give the HV an opportunity at the processor, since we are @@ -492,11 +494,11 @@ static inline void dedicated_idle_sleep(unsigned int cpu) static void pseries_dedicated_idle(void) { - long oldval; struct paca_struct *lpaca = get_paca(); unsigned int cpu = smp_processor_id(); unsigned long start_snooze; unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay); + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { /* @@ -505,10 +507,7 @@ static void pseries_dedicated_idle(void) */ lpaca->lppaca.idle = 1; - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { start_snooze = __get_tb() + *smt_snooze_delay * tb_ticks_per_usec; @@ -531,9 +530,6 @@ static void pseries_dedicated_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } lpaca->lppaca.idle = 0; diff --git a/arch/ppc/kernel/idle.c b/arch/ppc/kernel/idle.c index a6141f05c919..3c4e4cb61074 100644 --- a/arch/ppc/kernel/idle.c +++ b/arch/ppc/kernel/idle.c @@ -63,18 +63,18 @@ void cpu_idle(void) int cpu = smp_processor_id(); for (;;) { - if (ppc_md.idle != NULL) - ppc_md.idle(); - else - default_idle(); - if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) - cpu_die(); - if (need_resched()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + while (need_resched()) { + if (ppc_md.idle != NULL) + ppc_md.idle(); + else + default_idle(); } + if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) + cpu_die(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c index 909ea669af91..715bc0e71e0f 100644 --- a/arch/ppc64/kernel/idle.c +++ b/arch/ppc64/kernel/idle.c @@ -34,15 +34,11 @@ extern void power4_idle(void); void default_idle(void) { - long oldval; unsigned int cpu = smp_processor_id(); + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { while (!need_resched() && !cpu_is_offline(cpu)) { ppc64_runlatch_off(); @@ -55,9 +51,6 @@ void default_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } ppc64_runlatch_on(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 66ca5757e368..78b64fe5e7c2 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -99,14 +99,15 @@ void default_idle(void) { int cpu, rc; - local_irq_disable(); - if (need_resched()) { - local_irq_enable(); - return; - } - /* CPU is going idle. */ cpu = smp_processor_id(); + + local_irq_disable(); + if (need_resched()) { + local_irq_enable(); + return; + } + rc = notifier_call_chain(&idle_chain, CPU_IDLE, (void *)(long) cpu); if (rc != NOTIFY_OK && rc != NOTIFY_DONE) BUG(); @@ -119,7 +120,7 @@ void default_idle(void) __ctl_set_bit(8, 15); #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(cpu)) cpu_die(); #endif diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 1cbc26b796ad..fd4f240b833d 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -51,14 +51,13 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); -void default_idle(void) +void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { if (hlt_counter) { - while (1) - if (need_resched()) - break; + while (!need_resched()) + cpu_relax(); } else { while (!need_resched()) cpu_sleep(); @@ -70,11 +69,6 @@ void default_idle(void) } } -void cpu_idle(void) -{ - default_idle(); -} - void machine_restart(char * __unused) { /* SR.BL=1 and invoke address error to let CPU reset (manual reset) */ diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c index 0c09537449b3..b95d04141855 100644 --- a/arch/sh64/kernel/process.c +++ b/arch/sh64/kernel/process.c @@ -307,23 +307,19 @@ __setup("hlt", hlt_setup); static inline void hlt(void) { - if (hlt_counter) - return; - __asm__ __volatile__ ("sleep" : : : "memory"); } /* * The idle loop on a uniprocessor SH.. */ -void default_idle(void) +void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { if (hlt_counter) { - while (1) - if (need_resched()) - break; + while (!need_resched()) + cpu_relax(); } else { local_irq_disable(); while (!need_resched()) { @@ -338,11 +334,7 @@ void default_idle(void) schedule(); preempt_disable(); } -} -void cpu_idle(void) -{ - default_idle(); } void machine_restart(char * __unused) diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index c39f4d01096d..ea8647411462 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -67,13 +67,6 @@ extern void fpsave(unsigned long *, unsigned long *, void *, unsigned long *); struct task_struct *last_task_used_math = NULL; struct thread_info *current_set[NR_CPUS]; -/* - * default_idle is new in 2.5. XXX Review, currently stolen from sparc64. - */ -void default_idle(void) -{ -} - #ifndef CONFIG_SMP #define SUN4C_FAULT_HIGH 100 @@ -92,12 +85,11 @@ void cpu_idle(void) static unsigned long fps; unsigned long now; unsigned long faults; - unsigned long flags; extern unsigned long sun4c_kernel_faults; extern void sun4c_grow_kernel_ring(void); - local_irq_save(flags); + local_irq_disable(); now = jiffies; count -= (now - last_jiffies); last_jiffies = now; @@ -113,13 +105,16 @@ void cpu_idle(void) sun4c_grow_kernel_ring(); } } - local_irq_restore(flags); + local_irq_enable(); } - while((!need_resched()) && pm_idle) { - (*pm_idle)(); + if (pm_idle) { + while (!need_resched()) + (*pm_idle)(); + } else { + while (!need_resched()) + cpu_relax(); } - preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -132,15 +127,15 @@ void cpu_idle(void) /* This is being executed in task 0 'user space'. */ void cpu_idle(void) { + set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while(1) { - if(need_resched()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); - check_pgt_cache(); - } - barrier(); /* or else gcc optimizes... */ + while (!need_resched()) + cpu_relax(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + check_pgt_cache(); } } diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 2f89206e008f..02f9dec1d459 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -85,23 +85,31 @@ void cpu_idle(void) /* * the idle loop on a UltraMultiPenguin... + * + * TIF_POLLING_NRFLAG is set because we do not sleep the cpu + * inside of the idler task, so an interrupt is not needed + * to get a clean fast response. + * + * XXX Reverify this assumption... -DaveM + * + * Addendum: We do want it to do something for the signal + * delivery case, we detect that by just seeing + * if we are trying to send this to an idler or not. */ -#define idle_me_harder() (cpu_data(smp_processor_id()).idle_volume += 1) -#define unidle_me() (cpu_data(smp_processor_id()).idle_volume = 0) void cpu_idle(void) { + cpuinfo_sparc *cpuinfo = &local_cpu_data(); set_thread_flag(TIF_POLLING_NRFLAG); + while(1) { if (need_resched()) { - unidle_me(); - clear_thread_flag(TIF_POLLING_NRFLAG); + cpuinfo->idle_volume = 0; preempt_enable_no_resched(); schedule(); preempt_disable(); - set_thread_flag(TIF_POLLING_NRFLAG); check_pgt_cache(); } - idle_me_harder(); + cpuinfo->idle_volume++; /* The store ordering is so that IRQ handlers on * other cpus see our increasing idleness for the buddy diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 8aca4b1dc04e..797a65493fb8 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1152,20 +1152,9 @@ void __init smp_cpus_done(unsigned int max_cpus) (bogosum/(5000/HZ))%100); } -/* This needn't do anything as we do not sleep the cpu - * inside of the idler task, so an interrupt is not needed - * to get a clean fast response. - * - * XXX Reverify this assumption... -DaveM - * - * Addendum: We do want it to do something for the signal - * delivery case, we detect that by just seeing - * if we are trying to send this to an idler or not. - */ void smp_send_reschedule(int cpu) { - if (cpu_data(cpu).idle_volume == 0) - smp_receive_signal(cpu); + smp_receive_signal(cpu); } /* This is a nop because we capture all other cpus diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 571f9fe490ce..59be85d9a4bc 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -86,12 +86,22 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { + local_irq_enable(); + if (!atomic_read(&hlt_counter)) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); + while (!need_resched()) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + else + local_irq_enable(); + } + set_thread_flag(TIF_POLLING_NRFLAG); + } else { + while (!need_resched()) + cpu_relax(); } } @@ -102,30 +112,16 @@ void default_idle(void) */ static void poll_idle (void) { - int oldval; - local_irq_enable(); - /* - * Deal with another CPU just having chosen a thread to - * run here: - */ - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - asm volatile( - "2:" - "testl %0,%1;" - "rep; nop;" - "je 2b;" - : : - "i" (_TIF_NEED_RESCHED), - "m" (current_thread_info()->flags)); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); - } + asm volatile( + "2:" + "testl %0,%1;" + "rep; nop;" + "je 2b;" + : : + "i" (_TIF_NEED_RESCHED), + "m" (current_thread_info()->flags)); } void cpu_idle_wait(void) @@ -187,6 +183,8 @@ static inline void play_dead(void) */ void cpu_idle (void) { + set_thread_flag(TIF_POLLING_NRFLAG); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -221,15 +219,12 @@ static void mwait_idle(void) { local_irq_enable(); - if (!need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); - do { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - if (need_resched()) - break; - __mwait(0, 0); - } while (!need_resched()); - clear_thread_flag(TIF_POLLING_NRFLAG); + while (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (need_resched()) + break; + __mwait(0, 0); } } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 161db4acfb91..573b6a97bb1f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -167,6 +167,19 @@ acpi_processor_power_activate(struct acpi_processor *pr, return; } +static void acpi_safe_halt(void) +{ + int polling = test_thread_flag(TIF_POLLING_NRFLAG); + if (polling) { + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); + } + if (!need_resched()) + safe_halt(); + if (polling) + set_thread_flag(TIF_POLLING_NRFLAG); +} + static atomic_t c3_cpu_count; static void acpi_processor_idle(void) @@ -177,7 +190,7 @@ static void acpi_processor_idle(void) int sleep_ticks = 0; u32 t1, t2 = 0; - pr = processors[raw_smp_processor_id()]; + pr = processors[smp_processor_id()]; if (!pr) return; @@ -197,8 +210,13 @@ static void acpi_processor_idle(void) } cx = pr->power.state; - if (!cx) - goto easy_out; + if (!cx) { + if (pm_idle_save) + pm_idle_save(); + else + acpi_safe_halt(); + return; + } /* * Check BM Activity @@ -278,7 +296,8 @@ static void acpi_processor_idle(void) if (pm_idle_save) pm_idle_save(); else - safe_halt(); + acpi_safe_halt(); + /* * TBD: Can't get time duration while in C1, as resumes * go to an ISR rather than here. Need to instrument @@ -414,16 +433,6 @@ static void acpi_processor_idle(void) */ if (next_state != pr->power.state) acpi_processor_power_activate(pr, next_state); - - return; - - easy_out: - /* do C1 instead of busy loop */ - if (pm_idle_save) - pm_idle_save(); - else - safe_halt(); - return; } static int acpi_processor_set_power_policy(struct acpi_processor *pr) diff --git a/kernel/sched.c b/kernel/sched.c index 0f2def822296..ac3f5cc3bb51 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -864,21 +864,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) #ifdef CONFIG_SMP static void resched_task(task_t *p) { - int need_resched, nrpolling; + int cpu; assert_spin_locked(&task_rq(p)->lock); - /* minimise the chance of sending an interrupt to poll_idle() */ - nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); - need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); - nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; - if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) - smp_send_reschedule(task_cpu(p)); + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + smp_mb(); + if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + smp_send_reschedule(cpu); } #else static inline void resched_task(task_t *p) { + assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } #endif