From 9e46294dadedc0c04adcb8ce760bd2cd74f7332d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:00:52 +0200 Subject: [PATCH 1/6] x86: Save stack pointer in perf live regs savings In order to prepare for fetching the stack pointer from the regs when possible in dump_trace() instead of taking the local one, save the current stack pointer in perf live regs saving. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d9d4dae305f6..094fb30817ab 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); (regs)->bp = caller_frame_pointer(); \ (regs)->cs = __KERNEL_CS; \ regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ } #else From 47ce11a2b6519f9c7843223ea8e561eb71ea5896 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Jun 2011 19:04:56 +0200 Subject: [PATCH 2/6] x86: Fetch stack from regs when possible in dump_trace() When regs are passed to dump_stack(), we fetch the frame pointer from the regs but the stack pointer is taken from the current frame. Thus the frame and stack pointers may not come from the same context. For example this can result in the unwinder to think the context is in irq, due to the current value of the stack, but the frame pointer coming from the regs points to a frame from another place. It then tries to fix up the irq link but ends up dereferencing a random frame pointer that doesn't belong to the irq stack: [ 9131.706906] ------------[ cut here ]------------ [ 9131.707003] WARNING: at arch/x86/kernel/dumpstack_64.c:129 dump_trace+0x2aa/0x330() [ 9131.707003] Hardware name: AMD690VM-FMH [ 9131.707003] Perf: bad frame pointer = 0000000000000005 in callchain [ 9131.707003] Modules linked in: [ 9131.707003] Pid: 1050, comm: perf Not tainted 3.0.0-rc3+ #181 [ 9131.707003] Call Trace: [ 9131.707003] [] warn_slowpath_common+0x7a/0xb0 [ 9131.707003] [] warn_slowpath_fmt+0x41/0x50 [ 9131.707003] [] ? bad_to_user+0x6d/0x10be [ 9131.707003] [] dump_trace+0x2aa/0x330 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] perf_callchain_kernel+0x54/0x70 [ 9131.707003] [] perf_prepare_sample+0x19f/0x2a0 [ 9131.707003] [] __perf_event_overflow+0x16c/0x290 [ 9131.707003] [] ? __perf_event_overflow+0x130/0x290 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] ? sched_clock+0x9/0x10 [ 9131.707003] [] ? T.375+0x15/0x90 [ 9131.707003] [] ? trace_hardirqs_on_caller+0x64/0x180 [ 9131.707003] [] ? trace_hardirqs_off+0xd/0x10 [ 9131.707003] [] perf_event_overflow+0x14/0x20 [ 9131.707003] [] perf_swevent_hrtimer+0x11c/0x130 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] __run_hrtimer+0x83/0x1e0 [ 9131.707003] [] ? perf_event_overflow+0x20/0x20 [ 9131.707003] [] hrtimer_interrupt+0x106/0x250 [ 9131.707003] [] ? trace_hardirqs_off_thunk+0x3a/0x3c [ 9131.707003] [] smp_apic_timer_interrupt+0x53/0x90 [ 9131.707003] [] apic_timer_interrupt+0x13/0x20 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] ? error_exit+0x4c/0xb0 [ 9131.707003] ---[ end trace b2560d4876709347 ]--- Fix this by simply taking the stack pointer from regs->sp when regs are provided. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/dumpstack_64.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index e71c98d3c0d2..788295cbe4a7 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -155,9 +155,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, task = current; if (!stack) { - stack = &dummy; - if (task && task != current) + if (regs) + stack = (unsigned long *)regs->sp; + else if (task && task != current) stack = (unsigned long *)task->thread.sp; + else + stack = &dummy; } if (!bp) From 1871853f7abc3c727c4346539c5062cbeaf016a4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 01:51:22 +0200 Subject: [PATCH 3/6] x86,64: Simplify save_regs() The save_regs function that saves the regs on low level irq entry is complicated because of the fact it changes its stack in the middle and also because it manipulates data allocated in the caller frame and accesses there are directly calculated from callee rsp value with the return address in the middle of the way. This complicates the static stack offsets calculation and require more dynamic ones. It also needs a save/restore of the function's return address. To simplify and optimize this, turn save_regs() into a macro. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 42 +++++++++++++++----------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0c989e..b6b2e85454cf 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -297,27 +297,22 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ - .pushsection .kprobes.text, "ax" -ENTRY(save_args) - XCPT_FRAME + .macro SAVE_ARGS_IRQ cld - /* - * start from rbp in pt_regs and jump over - * return address. - */ - movq_cfi rdi, RDI+8-RBP - movq_cfi rsi, RSI+8-RBP - movq_cfi rdx, RDX+8-RBP - movq_cfi rcx, RCX+8-RBP - movq_cfi rax, RAX+8-RBP - movq_cfi r8, R8+8-RBP - movq_cfi r9, R9+8-RBP - movq_cfi r10, R10+8-RBP - movq_cfi r11, R11+8-RBP + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, RDI-RBP + movq_cfi rsi, RSI-RBP + movq_cfi rdx, RDX-RBP + movq_cfi rcx, RCX-RBP + movq_cfi rax, RAX-RBP + movq_cfi r8, R8-RBP + movq_cfi r9, R9-RBP + movq_cfi r10, R10-RBP + movq_cfi r11, R11-RBP - leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ - movq_cfi rbp, 8 /* push %rbp */ - leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 0 /* push %rbp */ + movq %rsp, %rbp testl $3, CS(%rdi) je 1f SWAPGS @@ -329,19 +324,14 @@ ENTRY(save_args) */ 1: incl PER_CPU_VAR(irq_count) jne 2f - popq_cfi %rax /* move return address... */ mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 pushq_cfi %rbp /* backlink for unwinder */ - pushq_cfi %rax /* ... to the new stack */ /* * We entered an interrupt context - irqs are off: */ 2: TRACE_IRQS_OFF - ret - CFI_ENDPROC -END(save_args) - .popsection + .endm ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 @@ -791,7 +781,7 @@ END(interrupt) /* reserve pt_regs for scratch regs and rbp */ subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP - call save_args + SAVE_ARGS_IRQ PARTIAL_FRAME 0 call \func .endm From 3b99a3ef55b292180473a221f3d6bc24455f0632 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 02:25:17 +0200 Subject: [PATCH 4/6] x86,64: Separate arg1 from rbp handling in SAVE_REGS_IRQ Just for clarity in the code. Have a first block that handles the frame pointer and a separate one that handles pt_regs pointer and its use. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b6b2e85454cf..20dc8e6b6d80 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,9 +310,10 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - leaq -RBP(%rsp),%rdi /* arg1 for handler */ movq_cfi rbp, 0 /* push %rbp */ movq %rsp, %rbp + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) je 1f SWAPGS From 48ffee7d9e6df51b4957bed64115b7beed671374 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:03:44 +0200 Subject: [PATCH 5/6] x86: Remove useless unwinder backlink from irq regs saving The unwinder backlink in interrupt entry is very useless. It's actually not part of the stack frame chain and thus is never used. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 20dc8e6b6d80..6131432c5afa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -327,7 +327,6 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - pushq_cfi %rbp /* backlink for unwinder */ /* * We entered an interrupt context - irqs are off: */ From a2bbe75089d5eb9a3a46d50dd5c215e213790288 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 16:52:45 +0200 Subject: [PATCH 6/6] x86: Don't use frame pointer to save old stack on irq entry rbp is used in SAVE_ARGS_IRQ to save the old stack pointer in order to restore it later in ret_from_intr. It is convenient because we save its value in the irq regs and it's easily restored using the leave instruction. However this is a kind of abuse of the frame pointer which role is to help unwinding the kernel by chaining frames together, each node following the return address to the previous frame. But although we are breaking the frame by changing the stack pointer, there is no preceding return address before the new frame. Hence using the frame pointer to link the two stacks breaks the stack unwinders that find a random value instead of a return address here. There is no workaround that can work in every case. We are using the fixup_bp_irq_link() function to dereference that abused frame pointer in the case of non nesting interrupt (which means stack changed). But that doesn't fix the case of interrupts that don't change the stack (but we still have the unconditional frame link), which is the case of hardirq interrupting softirq. We have no way to detect this transition so the frame irq link is considered as a real frame pointer and the return address is dereferenced but it is still a spurious one. There are two possible results of this: either the spurious return address, a random stack value, luckily belongs to the kernel text and then the unwinding can continue and we just have a weird entry in the stack trace. Or it doesn't belong to the kernel text and unwinding stops there. This is the reason why stacktraces (including perf callchains) on irqs that interrupted softirqs don't work very well. To solve this, we don't save the old stack pointer on rbp anymore but we save it to a scratch register that we push on the new stack and that we pop back later on irq return. This preserves the whole frame chain without spurious return addresses in the middle and drops the need for the horrid fixup_bp_irq_link() workaround. And finally irqs that interrupt softirq are sanely unwinded. Before: 99.81% perf [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--41.60%-- __read | | | |--99.90%-- create_worker | | bench_sched_messaging | | cmd_bench | | run_builtin | | main | | __libc_start_main | --0.10%-- [...] After: 1.64% swapper [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--95.00%-- arch_irq_work_raise | irq_work_queue | __perf_event_overflow | perf_swevent_overflow | perf_swevent_event | perf_tp_event | perf_trace_softirq | __do_softirq | call_softirq | do_softirq | irq_exit | | | |--73.68%-- smp_apic_timer_interrupt | | apic_timer_interrupt | | | | | |--96.43%-- amd_e400_idle | | | cpu_idle | | | start_secondary Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/dumpstack_64.c | 30 ------------------------------ arch/x86/kernel/entry_64.S | 27 +++++++++++++++------------ 2 files changed, 15 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 788295cbe4a7..19853ad8afc5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -104,34 +104,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, return (stack >= irq_stack && stack < irq_stack_end); } -/* - * We are returning from the irq stack and go to the previous one. - * If the previous stack is also in the irq stack, then bp in the first - * frame of the irq stack points to the previous, interrupted one. - * Otherwise we have another level of indirection: We first save - * the bp of the previous stack, then we switch the stack to the irq one - * and save a new bp that links to the previous one. - * (See save_args()) - */ -static inline unsigned long -fixup_bp_irq_link(unsigned long bp, unsigned long *stack, - unsigned long *irq_stack, unsigned long *irq_stack_end) -{ -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long next; - - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { - if (!probe_kernel_address(&frame->next_frame, next)) - return next; - else - WARN_ONCE(1, "Perf: bad frame pointer = %p in " - "callchain\n", &frame->next_frame); - } -#endif - return bp; -} - /* * x86-64 can have up to three kernel stacks: * process stack @@ -208,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); - bp = fixup_bp_irq_link(bp, stack, irq_stack, - irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6131432c5afa..d656f68371a4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,8 +310,11 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - movq_cfi rbp, 0 /* push %rbp */ - movq %rsp, %rbp + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) @@ -327,10 +330,11 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - /* - * We entered an interrupt context - irqs are off: - */ -2: TRACE_IRQS_OFF + +2: /* Store previous stack value */ + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF .endm ENTRY(save_rest) @@ -804,15 +808,14 @@ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) - leaveq - CFI_RESTORE rbp + /* Restore saved previous stack */ + popq %rsi + leaq 16(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 + CFI_ADJUST_CFA_OFFSET -16 - /* we did not save rbx, restore only from ARGOFFSET */ - addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp)