perf arm64: Inject missing frames when using 'perf record --call-graph=fp'
When unwinding using frame pointers on ARM64, the return address of the
current function may not have been pushed into the stack when a function
was interrupted, which makes perf show an incorrect call graph to the
user.
Consider the following example program:
void leaf() {
/* long computation */
}
void parent() {
// (1)
leaf();
// (2)
}
... could be compiled into (using gcc -fno-inline -fno-omit-frame-pointer):
leaf:
/* long computation */
nop
ret
parent:
// (1)
stp x29, x30, [sp, -16]!
mov x29, sp
bl leaf
nop
ldp x29, x30, [sp], 16
// (2)
ret
If the program is interrupted at (1), (2), or any point in "leaf:", the
call graph will skip the callers of the current function. We can unwind
using the dwarf info and check if the return addr is the same as the LR
register, and inject the missing frame into the call graph.
Before this patch, the above example shows the following call-graph when
recording using "--call-graph fp" mode in ARM64:
# Children Self Command Shared Object Symbol
# ........ ........ ........ ................ ......................
#
99.86% 99.86% program3 program3 [.] leaf
|
---_start
__libc_start_main
main
leaf
As can be seen, the "parent" function is missing. This is especially
problematic in "leaf" because for leaf functions the compiler may always
omit pushing the return addr into the stack. After this patch, it shows
the correct graph:
# Children Self Command Shared Object Symbol
# ........ ........ ........ ................ ......................
#
99.86% 99.86% program3 program3 [.] leaf
|
---_start
__libc_start_main
main
parent
leaf
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-7-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
[ Rename machine__normalize_is() to machine__normalized_is(), as suggested by James Clark ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2021-12-17 23:45:20 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include "arm64-frame-pointer-unwind-support.h"
|
|
|
|
#include "callchain.h"
|
|
|
|
#include "event.h"
|
|
|
|
#include "perf_regs.h" // SMPL_REG_MASK
|
|
|
|
#include "unwind.h"
|
|
|
|
|
|
|
|
#define perf_event_arm_regs perf_event_arm64_regs
|
2022-01-14 14:48:22 +08:00
|
|
|
#include "../../arch/arm64/include/uapi/asm/perf_regs.h"
|
perf arm64: Inject missing frames when using 'perf record --call-graph=fp'
When unwinding using frame pointers on ARM64, the return address of the
current function may not have been pushed into the stack when a function
was interrupted, which makes perf show an incorrect call graph to the
user.
Consider the following example program:
void leaf() {
/* long computation */
}
void parent() {
// (1)
leaf();
// (2)
}
... could be compiled into (using gcc -fno-inline -fno-omit-frame-pointer):
leaf:
/* long computation */
nop
ret
parent:
// (1)
stp x29, x30, [sp, -16]!
mov x29, sp
bl leaf
nop
ldp x29, x30, [sp], 16
// (2)
ret
If the program is interrupted at (1), (2), or any point in "leaf:", the
call graph will skip the callers of the current function. We can unwind
using the dwarf info and check if the return addr is the same as the LR
register, and inject the missing frame into the call graph.
Before this patch, the above example shows the following call-graph when
recording using "--call-graph fp" mode in ARM64:
# Children Self Command Shared Object Symbol
# ........ ........ ........ ................ ......................
#
99.86% 99.86% program3 program3 [.] leaf
|
---_start
__libc_start_main
main
leaf
As can be seen, the "parent" function is missing. This is especially
problematic in "leaf" because for leaf functions the compiler may always
omit pushing the return addr into the stack. After this patch, it shows
the correct graph:
# Children Self Command Shared Object Symbol
# ........ ........ ........ ................ ......................
#
99.86% 99.86% program3 program3 [.] leaf
|
---_start
__libc_start_main
main
parent
leaf
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-7-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
[ Rename machine__normalize_is() to machine__normalized_is(), as suggested by James Clark ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2021-12-17 23:45:20 +08:00
|
|
|
#undef perf_event_arm_regs
|
|
|
|
|
|
|
|
struct entries {
|
|
|
|
u64 stack[2];
|
|
|
|
size_t length;
|
|
|
|
};
|
|
|
|
|
|
|
|
static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
|
|
|
|
{
|
|
|
|
return callchain_param.record_mode == CALLCHAIN_FP && sample->user_regs.regs
|
|
|
|
&& sample->user_regs.mask & SMPL_REG_MASK(PERF_REG_ARM64_LR);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int add_entry(struct unwind_entry *entry, void *arg)
|
|
|
|
{
|
|
|
|
struct entries *entries = arg;
|
|
|
|
|
|
|
|
entries->stack[entries->length++] = entry->ip;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread, int usr_idx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct entries entries = {};
|
|
|
|
struct regs_dump old_regs = sample->user_regs;
|
|
|
|
|
|
|
|
if (!get_leaf_frame_caller_enabled(sample))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If PC and SP are not recorded, get the value of PC from the stack
|
|
|
|
* and set its mask. SP is not used when doing the unwinding but it
|
|
|
|
* still needs to be set to prevent failures.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (!(sample->user_regs.mask & SMPL_REG_MASK(PERF_REG_ARM64_PC))) {
|
|
|
|
sample->user_regs.cache_mask |= SMPL_REG_MASK(PERF_REG_ARM64_PC);
|
|
|
|
sample->user_regs.cache_regs[PERF_REG_ARM64_PC] = sample->callchain->ips[usr_idx+1];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(sample->user_regs.mask & SMPL_REG_MASK(PERF_REG_ARM64_SP))) {
|
|
|
|
sample->user_regs.cache_mask |= SMPL_REG_MASK(PERF_REG_ARM64_SP);
|
|
|
|
sample->user_regs.cache_regs[PERF_REG_ARM64_SP] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = unwind__get_entries(add_entry, &entries, thread, sample, 2);
|
|
|
|
sample->user_regs = old_regs;
|
|
|
|
|
|
|
|
if (ret || entries.length != 2)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return callchain_param.order == ORDER_CALLER ? entries.stack[0] : entries.stack[1];
|
|
|
|
}
|