From 58154c8ce71a7854d969d73468fd00e5eeeab708 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:09 +0000 Subject: [PATCH 1/9] powerpc: Give us time to get all oopses out before panicking I've been seeing truncated output when people send system reset info to me. We should see a backtrace for every CPU, but the panic() code takes the box down before they all make it out to the console. The panic code runs unlocked so we also see corrupted console output. If we are going to panic, then delay 1 second before calling into the panic code. Move oops_exit inside the die lock and put a newline between oopses for clarity. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/traps.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5459d148a0f6..91377870b26a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -158,6 +158,8 @@ int die(const char *str, struct pt_regs *regs, long err) bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); + oops_exit(); + printk("\n"); raw_spin_unlock_irqrestore(&die.lock, flags); if (kexec_should_crash(current) || @@ -165,13 +167,23 @@ int die(const char *str, struct pt_regs *regs, long err) crash_kexec(regs); crash_kexec_secondary(regs); + /* + * While our oops output is serialised by a spinlock, output + * from panic() called below can race and corrupt it. If we + * know we are going to panic, delay for 1 second so we have a + * chance to get clean backtraces from all CPUs that are oopsing. 
+ */ + if (in_interrupt() || panic_on_oops || !current->pid || + is_global_init(current)) { + mdelay(MSEC_PER_SEC); + } + if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(err); return 0; From 9b00ac06978c54788f13eefd34a07b77db48d567 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:10 +0000 Subject: [PATCH 2/9] powerpc: Remove broken and complicated kdump system reset code We have a lot of complicated logic that handles possible recursion between kdump and a system reset exception. We can solve this in a much simpler way using the same setjmp/longjmp tricks xmon does. As a first step, this patch removes the old system reset code. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/kexec.h | 6 --- arch/powerpc/kernel/crash.c | 87 +++++--------------------------- arch/powerpc/kernel/traps.c | 33 +++++------- 3 files changed, 25 insertions(+), 101 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index f921eb121d39..209ed8ba1637 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -73,11 +73,6 @@ extern void kexec_smp_wait(void); /* get and clear naca physid, wait for master to copy new code to 0 */ extern int crashing_cpu; extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)); -extern cpumask_t cpus_in_sr; -static inline int kexec_sr_activated(int cpu) -{ - return cpumask_test_cpu(cpu, &cpus_in_sr); -} struct kimage; struct pt_regs; @@ -94,7 +89,6 @@ extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); #else /* !CONFIG_KEXEC */ -static inline int kexec_sr_activated(int cpu) { return 0; } static inline void crash_kexec_secondary(struct pt_regs *regs) { } static inline int overlaps_crashkernel(unsigned long start, unsigned long size) diff --git a/arch/powerpc/kernel/crash.c 
b/arch/powerpc/kernel/crash.c index d879809d5c45..3d87b205d5f5 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -47,7 +47,6 @@ /* This keeps a track of which one is crashing cpu. */ int crashing_cpu = -1; static cpumask_t cpus_in_crash = CPU_MASK_NONE; -cpumask_t cpus_in_sr = CPU_MASK_NONE; #define CRASH_HANDLER_MAX 3 /* NULL terminated list of shutdown handles */ @@ -55,7 +54,6 @@ static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX+1]; static DEFINE_SPINLOCK(crash_handlers_lock); #ifdef CONFIG_SMP -static atomic_t enter_on_soft_reset = ATOMIC_INIT(0); void crash_ipi_callback(struct pt_regs *regs) { @@ -69,24 +67,9 @@ void crash_ipi_callback(struct pt_regs *regs) crash_save_cpu(regs, cpu); cpumask_set_cpu(cpu, &cpus_in_crash); - /* - * Entered via soft-reset - could be the kdump - * process is invoked using soft-reset or user activated - * it if some CPU did not respond to an IPI. - * For soft-reset, the secondary CPU can enter this func - * twice. 1 - using IPI, and 2. soft-reset. - * Tell the kexec CPU that entered via soft-reset and ready - * to go down. - */ - if (cpumask_test_cpu(cpu, &cpus_in_sr)) { - cpumask_clear_cpu(cpu, &cpus_in_sr); - atomic_inc(&enter_on_soft_reset); - } - /* * Starting the kdump boot. * This barrier is needed to make sure that all CPUs are stopped. - * If not, soft-reset will be invoked to bring other CPUs. */ while (!cpumask_test_cpu(crashing_cpu, &cpus_in_crash)) cpu_relax(); @@ -103,25 +86,14 @@ void crash_ipi_callback(struct pt_regs *regs) /* NOTREACHED */ } -/* - * Wait until all CPUs are entered via soft-reset. 
- */ -static void crash_soft_reset_check(int cpu) -{ - unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ - - cpumask_clear_cpu(cpu, &cpus_in_sr); - while (atomic_read(&enter_on_soft_reset) != ncpus) - cpu_relax(); -} - - static void crash_kexec_prepare_cpus(int cpu) { unsigned int msecs; unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ + printk(KERN_EMERG "Sending IPI to other CPUs\n"); + crash_send_ipi(crash_ipi_callback); smp_wmb(); @@ -131,7 +103,6 @@ static void crash_kexec_prepare_cpus(int cpu) * respond. * Delay of at least 10 seconds. */ - printk(KERN_EMERG "Sending IPI to other cpus...\n"); msecs = 10000; while ((cpumask_weight(&cpus_in_crash) < ncpus) && (--msecs > 0)) { cpu_relax(); @@ -140,69 +111,36 @@ static void crash_kexec_prepare_cpus(int cpu) /* Would it be better to replace the trap vector here? */ - /* - * FIXME: In case if we do not get all CPUs, one possibility: ask the - * user to do soft reset such that we get all. - * Soft-reset will be used until better mechanism is implemented. - */ if (cpumask_weight(&cpus_in_crash) < ncpus) { - printk(KERN_EMERG "done waiting: %d cpu(s) not responding\n", + printk(KERN_EMERG "ERROR: %d CPU(s) not responding\n", ncpus - cpumask_weight(&cpus_in_crash)); - printk(KERN_EMERG "Activate soft-reset to stop other cpu(s)\n"); - cpumask_clear(&cpus_in_sr); - atomic_set(&enter_on_soft_reset, 0); - while (cpumask_weight(&cpus_in_crash) < ncpus) - cpu_relax(); } - /* - * Make sure all CPUs are entered via soft-reset if the kdump is - * invoked using soft-reset. - */ - if (cpumask_test_cpu(cpu, &cpus_in_sr)) - crash_soft_reset_check(cpu); - /* Leave the IPI callback set */ + + printk(KERN_EMERG "IPI complete\n"); } /* - * This function will be called by secondary cpus or by kexec cpu - * if soft-reset is activated to stop some CPUs. + * This function will be called by secondary cpus. 
*/ void crash_kexec_secondary(struct pt_regs *regs) { - int cpu = smp_processor_id(); unsigned long flags; - int msecs = 5; + int msecs = 500; local_irq_save(flags); - /* Wait 5ms if the kexec CPU is not entered yet. */ + + /* Wait 500ms for the primary crash CPU to signal its progress */ while (crashing_cpu < 0) { if (--msecs < 0) { - /* - * Either kdump image is not loaded or - * kdump process is not started - Probably xmon - * exited using 'x'(exit and recover) or - * kexec_should_crash() failed for all running tasks. - */ - cpumask_clear_cpu(cpu, &cpus_in_sr); + /* No response, kdump image may not have been loaded */ local_irq_restore(flags); return; } + mdelay(1); cpu_relax(); } - if (cpu == crashing_cpu) { - /* - * Panic CPU will enter this func only via soft-reset. - * Wait until all secondary CPUs entered and - * then start kexec boot. - */ - crash_soft_reset_check(cpu); - cpumask_set_cpu(crashing_cpu, &cpus_in_crash); - if (ppc_md.kexec_cpu_down) - ppc_md.kexec_cpu_down(1, 0); - machine_kexec(kexec_crash_image); - /* NOTREACHED */ - } + crash_ipi_callback(regs); } @@ -225,7 +163,6 @@ static void crash_kexec_prepare_cpus(int cpu) void crash_kexec_secondary(struct pt_regs *regs) { - cpumask_clear(&cpus_in_sr); } #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 91377870b26a..014f88f03d3f 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -162,10 +162,20 @@ int die(const char *str, struct pt_regs *regs, long err) printk("\n"); raw_spin_unlock_irqrestore(&die.lock, flags); - if (kexec_should_crash(current) || - kexec_sr_activated(smp_processor_id())) + /* + * A system reset (0x100) is a request to dump, so we always send + * it through the crashdump code. + */ + if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) { crash_kexec(regs); - crash_kexec_secondary(regs); + + /* + * We aren't the primary crash CPU. 
We need to send it + * to a holding pattern to avoid it ending up in the panic + * code. + */ + crash_kexec_secondary(regs); + } /* * While our oops output is serialised by a spinlock, output @@ -232,25 +242,8 @@ void system_reset_exception(struct pt_regs *regs) return; } -#ifdef CONFIG_KEXEC - cpumask_set_cpu(smp_processor_id(), &cpus_in_sr); -#endif - die("System Reset", regs, SIGABRT); - /* - * Some CPUs when released from the debugger will execute this path. - * These CPUs entered the debugger via a soft-reset. If the CPU was - * hung before entering the debugger it will return to the hung - * state when exiting this function. This causes a problem in - * kdump since the hung CPU(s) will not respond to the IPI sent - * from kdump. To prevent the problem we call crash_kexec_secondary() - * here. If a kdump had not been initiated or we exit the debugger - * with the "exit and recover" command (x) crash_kexec_secondary() - * will return after 5ms and the CPU returns to its previous state. - */ - crash_kexec_secondary(regs); - /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) panic("Unrecoverable System Reset"); From 07fe0c6132578186773e01ffb0f63ded222effe7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:11 +0000 Subject: [PATCH 3/9] powerpc/kdump: Use setjmp/longjmp to handle kdump and system reset recursion We can handle recursion caused by system reset by reusing the crash shutdown fault handler. Since we don't have an OS triggerable NMI, if all CPUs don't make it into kdump then we tell the user to issue a system reset. However if we have a panic timeout set we cannot wait forever and must continue the kdump. 
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/crash.c | 72 +++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 3d87b205d5f5..a8b6e2d705a4 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -53,6 +53,16 @@ static cpumask_t cpus_in_crash = CPU_MASK_NONE; static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX+1]; static DEFINE_SPINLOCK(crash_handlers_lock); +static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; +static int crash_shutdown_cpu = -1; + +static int handle_fault(struct pt_regs *regs) +{ + if (crash_shutdown_cpu == smp_processor_id()) + longjmp(crash_shutdown_buf, 1); + return 0; +} + #ifdef CONFIG_SMP void crash_ipi_callback(struct pt_regs *regs) @@ -89,14 +99,16 @@ void crash_ipi_callback(struct pt_regs *regs) static void crash_kexec_prepare_cpus(int cpu) { unsigned int msecs; - unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ + int tries = 0; + int (*old_handler)(struct pt_regs *regs); printk(KERN_EMERG "Sending IPI to other CPUs\n"); crash_send_ipi(crash_ipi_callback); smp_wmb(); +again: /* * FIXME: Until we will have the way to stop other CPUs reliably, * the crash CPU will send an IPI and wait for other CPUs to @@ -111,12 +123,52 @@ static void crash_kexec_prepare_cpus(int cpu) /* Would it be better to replace the trap vector here? */ - if (cpumask_weight(&cpus_in_crash) < ncpus) { - printk(KERN_EMERG "ERROR: %d CPU(s) not responding\n", - ncpus - cpumask_weight(&cpus_in_crash)); + if (cpumask_weight(&cpus_in_crash) >= ncpus) { + printk(KERN_EMERG "IPI complete\n"); + return; } - printk(KERN_EMERG "IPI complete\n"); + printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n", + ncpus - cpumask_weight(&cpus_in_crash)); + + /* + * If we have a panic timeout set then we can't wait indefinitely + * for someone to activate system reset. 
We also give up on the + * second time through if system reset fail to work. + */ + if ((panic_timeout > 0) || (tries > 0)) + return; + + /* + * A system reset will cause all CPUs to take an 0x100 exception. + * The primary CPU returns here via setjmp, and the secondary + * CPUs reexecute the crash_kexec_secondary path. + */ + old_handler = __debugger; + __debugger = handle_fault; + crash_shutdown_cpu = smp_processor_id(); + + if (setjmp(crash_shutdown_buf) == 0) { + printk(KERN_EMERG "Activate system reset (dumprestart) " + "to stop other cpu(s)\n"); + + /* + * A system reset will force all CPUs to execute the + * crash code again. We need to reset cpus_in_crash so we + * wait for everyone to do this. + */ + cpus_in_crash = CPU_MASK_NONE; + smp_mb(); + + while (cpumask_weight(&cpus_in_crash) < ncpus) + cpu_relax(); + } + + crash_shutdown_cpu = -1; + __debugger = old_handler; + + tries++; + goto again; } /* @@ -245,16 +297,6 @@ int crash_shutdown_unregister(crash_shutdown_t handler) } EXPORT_SYMBOL(crash_shutdown_unregister); -static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; -static int crash_shutdown_cpu = -1; - -static int handle_fault(struct pt_regs *regs) -{ - if (crash_shutdown_cpu == smp_processor_id()) - longjmp(crash_shutdown_buf, 1); - return 0; -} - void default_machine_crash_shutdown(struct pt_regs *regs) { unsigned int i; From 8c27474a252e84e8a71ae4a43e18f0193a08e3f7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:12 +0000 Subject: [PATCH 4/9] powerpc: Cleanup crash/kexec code Remove some unnecessary defines and fix some spelling mistakes. 
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/kexec.h | 1 - arch/powerpc/kernel/crash.c | 21 +++------------------ 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 209ed8ba1637..16d7e33d35e9 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -49,7 +49,6 @@ #define KEXEC_STATE_REAL_MODE 2 #ifndef __ASSEMBLY__ -#include #include typedef void (*crash_shutdown_t)(void); diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index a8b6e2d705a4..d4467557b00e 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -10,41 +10,27 @@ * */ -#undef DEBUG - #include #include #include #include -#include #include #include #include -#include -#include #include #include #include -#include #include #include #include #include #include -#include #include #include #include -#ifdef DEBUG -#include -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -/* This keeps a track of which one is crashing cpu. */ +/* This keeps a track of which one is the crashing cpu. */ int crashing_cpu = -1; static cpumask_t cpus_in_crash = CPU_MASK_NONE; @@ -201,7 +187,7 @@ void crash_kexec_secondary(struct pt_regs *regs) static void crash_kexec_prepare_cpus(int cpu) { /* - * move the secondarys to us so that we can copy + * move the secondaries to us so that we can copy * the new kernel 0-0x100 safely * * do this if kexec in setup.c ? @@ -302,7 +288,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) unsigned int i; int (*old_handler)(struct pt_regs *regs); - /* * This function is only called after the system * has panicked or is otherwise in a critical state. @@ -328,7 +313,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) machine_kexec_mask_interrupts(); /* - * Call registered shutdown routines savely. 
Swap out + * Call registered shutdown routines safely. Swap out * __debugger_fault_handler, and replace on exit. */ old_handler = __debugger_fault_handler; From 760ca4dc90e624eb8f7ff85a5925151e25577758 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:13 +0000 Subject: [PATCH 5/9] powerpc: Rework die() Our die() code was based off a very old x86 version. Update it to mirror the current x86 code. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/system.h | 2 +- arch/powerpc/kernel/traps.c | 128 +++++++++++++++++------------- 2 files changed, 74 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h index e30a13d1ee76..d51c2c20dd23 100644 --- a/arch/powerpc/include/asm/system.h +++ b/arch/powerpc/include/asm/system.h @@ -193,8 +193,8 @@ extern void cacheable_memzero(void *p, unsigned int nb); extern void *cacheable_memcpy(void *, const void *, unsigned int); extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long); extern void bad_page_fault(struct pt_regs *, unsigned long, int); -extern int die(const char *, struct pt_regs *, long); extern void _exception(int, struct pt_regs *, int, unsigned long); +extern void die(const char *, struct pt_regs *, long); extern void _nmask_and_or_msr(unsigned long nmask, unsigned long or_val); #ifdef CONFIG_BOOKE_WDT diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 014f88f03d3f..c091527efd89 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -98,18 +98,14 @@ static void pmac_backlight_unblank(void) static inline void pmac_backlight_unblank(void) { } #endif -int die(const char *str, struct pt_regs *regs, long err) +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; +static int die_counter; + +static unsigned __kprobes long oops_begin(struct pt_regs *regs) { - 
static struct { - raw_spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(die.lock), - .lock_owner = -1, - .lock_owner_depth = 0 - }; - static int die_counter; + int cpu; unsigned long flags; if (debugger(regs)) @@ -117,50 +113,37 @@ int die(const char *str, struct pt_regs *regs, long err) oops_enter(); - if (die.lock_owner != raw_smp_processor_id()) { - console_verbose(); - raw_spin_lock_irqsave(&die.lock, flags); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - if (machine_is(powermac)) - pmac_backlight_unblank(); - } else { - local_save_flags(flags); - } - - if (++die.lock_owner_depth < 3) { - printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP NR_CPUS=%d ", NR_CPUS); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC "); -#endif -#ifdef CONFIG_NUMA - printk("NUMA "); -#endif - printk("%s\n", ppc_md.name ? ppc_md.name : ""); - - if (notify_die(DIE_OOPS, str, regs, err, 255, - SIGSEGV) == NOTIFY_STOP) - return 1; - - print_modules(); - show_regs(regs); - } else { - printk("Recursive die() failure, output suppressed\n"); + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!arch_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + arch_spin_lock(&die_lock); } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + if (machine_is(powermac)) + pmac_backlight_unblank(); + return flags; +} +static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, + int signr) +{ bust_spinlocks(0); - die.lock_owner = -1; + die_owner = -1; add_taint(TAINT_DIE); + die_nest_count--; oops_exit(); printk("\n"); - raw_spin_unlock_irqrestore(&die.lock, flags); + if (!die_nest_count) + /* Nest count reaches zero, release the lock. 
*/ + arch_spin_unlock(&die_lock); + raw_local_irq_restore(flags); /* * A system reset (0x100) is a request to dump, so we always send @@ -177,6 +160,9 @@ int die(const char *str, struct pt_regs *regs, long err) crash_kexec_secondary(regs); } + if (!signr) + return; + /* * While our oops output is serialised by a spinlock, output * from panic() called below can race and corrupt it. If we @@ -190,15 +176,46 @@ int die(const char *str, struct pt_regs *regs, long err) if (in_interrupt()) panic("Fatal exception in interrupt"); - if (panic_on_oops) panic("Fatal exception"); + do_exit(signr); +} - do_exit(err); +static int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP NR_CPUS=%d ", NR_CPUS); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC "); +#endif +#ifdef CONFIG_NUMA + printk("NUMA "); +#endif + printk("%s\n", ppc_md.name ? ppc_md.name : ""); + + if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) + return 1; + + print_modules(); + show_regs(regs); return 0; } +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(regs); + + if (__die(str, regs, err)) + err = 0; + oops_end(flags, regs, err); +} + void user_single_step_siginfo(struct task_struct *tsk, struct pt_regs *regs, siginfo_t *info) { @@ -217,10 +234,11 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) "at %016lx nip %016lx lr %016lx code %x\n"; if (!user_mode(regs)) { - if (die("Exception in kernel mode", regs, signr)) - return; - } else if (show_unhandled_signals && - unhandled_signal(current, signr)) { + die("Exception in kernel mode", regs, signr); + return; + } + + if (show_unhandled_signals && unhandled_signal(current, signr)) { printk_ratelimited(regs->msr & MSR_64BIT ? 
fmt64 : fmt32, current->comm, current->pid, signr, addr, regs->nip, regs->link, code); From a934904d8ac2411ca329fc50daa29ab35a8f198b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:14 +0000 Subject: [PATCH 6/9] powerpc: Reduce pseries panic timeout from 180s to 10s We've had a 180 second panic timeout on ppc64 for as long as I can remember. This patch reduces it to 10 seconds on pseries for a few reasons: - Almost all pseries machines have a hypervisor console so panic output will be available in a scrollback buffer. - The 180 seconds impacts our availability; users (other than kernel hackers) just want the box to come back around so it can continue its work. - I spend a lot of my life staring at the 180 second panic timeout. Many pseries machines take minutes to power cycle, so it's quicker to sit through the 180 seconds than it is to power cycle. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 01df08dbc43c..992d797e07da 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -353,6 +353,8 @@ early_initcall(alloc_dispatch_log_kmem_cache); static void __init pSeries_setup_arch(void) { + panic_timeout = 10; + /* Discover PIC type and setup ppc_md accordingly */ pseries_discover_pic(); From 90e8f57cf873188bc5cff445059ddeb72dc51d8c Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:15 +0000 Subject: [PATCH 7/9] powerpc/xics: Reset the CPPR if H_EOI fails I have an intermittent kdump failure where the hypervisor fails an H_EOI. As a result our CPPR is never reset to 0xff and we no longer accept interrupts. This patch calls icp_hv_set_cppr to reset the CPPR if H_EOI fails, fixing the kdump failure. 
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/xics/icp-hv.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c index 784b3fc6f071..253dce98c16e 100644 --- a/arch/powerpc/sysdev/xics/icp-hv.c +++ b/arch/powerpc/sysdev/xics/icp-hv.c @@ -41,16 +41,6 @@ static inline unsigned int icp_hv_get_xirr(unsigned char cppr) return ret; } -static inline void icp_hv_set_xirr(unsigned int value) -{ - long rc = plpar_hcall_norets(H_EOI, value); - if (rc != H_SUCCESS) { - pr_err("%s: bad return code eoi xirr=0x%x returned %ld\n", - __func__, value, rc); - WARN_ON_ONCE(1); - } -} - static inline void icp_hv_set_cppr(u8 value) { long rc = plpar_hcall_norets(H_CPPR, value); @@ -61,6 +51,17 @@ static inline void icp_hv_set_cppr(u8 value) } } +static inline void icp_hv_set_xirr(unsigned int value) +{ + long rc = plpar_hcall_norets(H_EOI, value); + if (rc != H_SUCCESS) { + pr_err("%s: bad return code eoi xirr=0x%x returned %ld\n", + __func__, value, rc); + WARN_ON_ONCE(1); + icp_hv_set_cppr(value >> 24); + } +} + static inline void icp_hv_set_qirr(int n_cpu , u8 value) { int hw_cpu = get_hard_smp_processor_id(n_cpu); From 549e88a134b3b393a4312e8d76628b9260eee57f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:16 +0000 Subject: [PATCH 8/9] powerpc/kdump: Delay before sending IPI on a system reset If we enter the kdump code via system reset, wait a bit before sending the IPI to capture all secondary CPUs. Without it we race with the hypervisor that is issuing the system reset to each CPU. If the IPI gets there first the system reset oops output then shows the register state of the IPI handler which is not what we want. I took the opportunity to add defines for all the various delays we have. There's no need for cpu_relax when we are doing an mdelay, so remove them too. 
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/crash.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index d4467557b00e..b942980e9650 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -30,6 +30,20 @@ #include #include +/* + * The primary CPU waits a while for all secondary CPUs to enter. This is to + * avoid sending an IPI if the secondary CPUs are entering + * crash_kexec_secondary on their own (eg via a system reset). + * + * The secondary timeout has to be longer than the primary. Both timeouts are + * in milliseconds. + */ +#define PRIMARY_TIMEOUT 500 +#define SECONDARY_TIMEOUT 1000 + +#define IPI_TIMEOUT 10000 +#define REAL_MODE_TIMEOUT 10000 + /* This keeps a track of which one is the crashing cpu. */ int crashing_cpu = -1; static cpumask_t cpus_in_crash = CPU_MASK_NONE; @@ -99,11 +113,9 @@ again: * FIXME: Until we will have the way to stop other CPUs reliably, * the crash CPU will send an IPI and wait for other CPUs to * respond. - * Delay of at least 10 seconds. 
*/ - msecs = 10000; + msecs = IPI_TIMEOUT; while ((cpumask_weight(&cpus_in_crash) < ncpus) && (--msecs > 0)) { - cpu_relax(); mdelay(1); } @@ -163,11 +175,11 @@ again: void crash_kexec_secondary(struct pt_regs *regs) { unsigned long flags; - int msecs = 500; + int msecs = SECONDARY_TIMEOUT; local_irq_save(flags); - /* Wait 500ms for the primary crash CPU to signal its progress */ + /* Wait for the primary crash CPU to signal its progress */ while (crashing_cpu < 0) { if (--msecs < 0) { /* No response, kdump image may not have been loaded */ @@ -176,7 +188,6 @@ void crash_kexec_secondary(struct pt_regs *regs) } mdelay(1); - cpu_relax(); } crash_ipi_callback(regs); @@ -211,7 +222,7 @@ static void crash_kexec_wait_realmode(int cpu) unsigned int msecs; int i; - msecs = 10000; + msecs = REAL_MODE_TIMEOUT; for (i=0; i < nr_cpu_ids && msecs > 0; i++) { if (i == cpu) continue; @@ -306,6 +317,14 @@ void default_machine_crash_shutdown(struct pt_regs *regs) */ crashing_cpu = smp_processor_id(); crash_save_cpu(regs, crashing_cpu); + + /* + * If we came in via system reset, wait a while for the secondary + * CPUs to enter. + */ + if (TRAP(regs) == 0x100) + mdelay(PRIMARY_TIMEOUT); + crash_kexec_prepare_cpus(crashing_cpu); cpumask_set_cpu(crashing_cpu, &cpus_in_crash); crash_kexec_wait_realmode(crashing_cpu); From 2440c01e10f07adcbc2094ba12ae4ad6094bd2b6 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 30 Nov 2011 00:23:17 +0000 Subject: [PATCH 9/9] powerpc/kdump: Only save CPU state first time through the secondary CPU capture code We might enter the secondary CPU capture code twice, eg if we have to unstick some CPUs with a system reset. In this case we don't want to overwrite the state on CPUs that had made it into the capture code OK, so use the cpus_state_saved cpumask for that and make it local to crash_ipi_callback. 
To control progress, we now use an atomic_t, cpus_in_crash, to count how many CPUs have made it into the kdump code, and time_to_dump to tell everyone it's time to dump. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/crash.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index b942980e9650..28be3452e67a 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -46,7 +46,8 @@ /* This keeps a track of which one is the crashing cpu. */ int crashing_cpu = -1; -static cpumask_t cpus_in_crash = CPU_MASK_NONE; +static atomic_t cpus_in_crash; +static int time_to_dump; #define CRASH_HANDLER_MAX 3 /* NULL terminated list of shutdown handles */ @@ -67,21 +68,27 @@ static int handle_fault(struct pt_regs *regs) void crash_ipi_callback(struct pt_regs *regs) { + static cpumask_t cpus_state_saved = CPU_MASK_NONE; + int cpu = smp_processor_id(); if (!cpu_online(cpu)) return; hard_irq_disable(); - if (!cpumask_test_cpu(cpu, &cpus_in_crash)) + if (!cpumask_test_cpu(cpu, &cpus_state_saved)) { crash_save_cpu(regs, cpu); - cpumask_set_cpu(cpu, &cpus_in_crash); + cpumask_set_cpu(cpu, &cpus_state_saved); + } + + atomic_inc(&cpus_in_crash); + smp_mb__after_atomic_inc(); /* * Starting the kdump boot. * This barrier is needed to make sure that all CPUs are stopped. */ - while (!cpumask_test_cpu(crashing_cpu, &cpus_in_crash)) + while (!time_to_dump) cpu_relax(); if (ppc_md.kexec_cpu_down) @@ -115,19 +122,18 @@ again: * respond. */ msecs = IPI_TIMEOUT; - while ((cpumask_weight(&cpus_in_crash) < ncpus) && (--msecs > 0)) { + while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0)) mdelay(1); - } /* Would it be better to replace the trap vector here? 
*/ - if (cpumask_weight(&cpus_in_crash) >= ncpus) { + if (atomic_read(&cpus_in_crash) >= ncpus) { printk(KERN_EMERG "IPI complete\n"); return; } printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n", - ncpus - cpumask_weight(&cpus_in_crash)); + ncpus - atomic_read(&cpus_in_crash)); /* * If we have a panic timeout set then we can't wait indefinitely @@ -155,10 +161,10 @@ again: * crash code again. We need to reset cpus_in_crash so we * wait for everyone to do this. */ - cpus_in_crash = CPU_MASK_NONE; + atomic_set(&cpus_in_crash, 0); smp_mb(); - while (cpumask_weight(&cpus_in_crash) < ncpus) + while (atomic_read(&cpus_in_crash) < ncpus) cpu_relax(); } @@ -316,7 +322,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) * such that another IPI will not be sent. */ crashing_cpu = smp_processor_id(); - crash_save_cpu(regs, crashing_cpu); /* * If we came in via system reset, wait a while for the secondary @@ -326,7 +331,11 @@ void default_machine_crash_shutdown(struct pt_regs *regs) mdelay(PRIMARY_TIMEOUT); crash_kexec_prepare_cpus(crashing_cpu); - cpumask_set_cpu(crashing_cpu, &cpus_in_crash); + + crash_save_cpu(regs, crashing_cpu); + + time_to_dump = 1; + crash_kexec_wait_realmode(crashing_cpu); machine_kexec_mask_interrupts();