2019-05-29 22:12:25 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2011-01-30 20:46:46 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
|
|
|
|
*
|
|
|
|
* Parts came from builtin-{top,stat,record}.c, see those files for further
|
|
|
|
* copyright notes.
|
|
|
|
*/
|
|
|
|
|
2011-09-06 23:12:26 +08:00
|
|
|
#include <byteswap.h>
|
2017-04-18 21:46:11 +08:00
|
|
|
#include <errno.h>
|
2017-04-18 02:23:08 +08:00
|
|
|
#include <inttypes.h>
|
2012-08-07 21:20:45 +08:00
|
|
|
#include <linux/bitops.h>
|
2017-06-20 23:05:38 +08:00
|
|
|
#include <api/fs/fs.h>
|
2015-09-02 15:56:43 +08:00
|
|
|
#include <api/fs/tracing_path.h>
|
2013-06-11 23:29:18 +08:00
|
|
|
#include <traceevent/event-parse.h>
|
|
|
|
#include <linux/hw_breakpoint.h>
|
|
|
|
#include <linux/perf_event.h>
|
2017-06-16 23:18:27 +08:00
|
|
|
#include <linux/compiler.h>
|
2015-09-07 16:38:06 +08:00
|
|
|
#include <linux/err.h>
|
2019-07-04 22:32:27 +08:00
|
|
|
#include <linux/zalloc.h>
|
2017-04-20 06:03:14 +08:00
|
|
|
#include <sys/ioctl.h>
|
2013-08-05 10:41:26 +08:00
|
|
|
#include <sys/resource.h>
|
2017-06-20 23:05:38 +08:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <dirent.h>
|
2019-08-31 01:45:20 +08:00
|
|
|
#include <stdlib.h>
|
2019-07-21 19:24:24 +08:00
|
|
|
#include <perf/evsel.h>
|
2013-06-11 23:29:18 +08:00
|
|
|
#include "asm/bug.h"
|
2014-10-10 03:12:24 +08:00
|
|
|
#include "callchain.h"
|
2014-10-17 23:17:40 +08:00
|
|
|
#include "cgroup.h"
|
2019-08-22 01:20:54 +08:00
|
|
|
#include "counts.h"
|
2017-04-26 02:30:47 +08:00
|
|
|
#include "event.h"
|
2011-01-04 02:39:04 +08:00
|
|
|
#include "evsel.h"
|
2019-09-30 22:48:32 +08:00
|
|
|
#include "util/env.h"
|
2019-09-25 02:56:14 +08:00
|
|
|
#include "util/evsel_config.h"
|
2019-09-25 02:41:51 +08:00
|
|
|
#include "util/evsel_fprintf.h"
|
2011-01-13 03:03:24 +08:00
|
|
|
#include "evlist.h"
|
2019-09-10 23:29:02 +08:00
|
|
|
#include <perf/cpumap.h>
|
2011-01-19 01:15:24 +08:00
|
|
|
#include "thread_map.h"
|
2012-04-26 13:15:22 +08:00
|
|
|
#include "target.h"
|
2012-08-07 21:20:47 +08:00
|
|
|
#include "perf_regs.h"
|
2019-08-23 02:40:29 +08:00
|
|
|
#include "record.h"
|
2013-08-14 20:48:24 +08:00
|
|
|
#include "debug.h"
|
2013-12-03 21:09:24 +08:00
|
|
|
#include "trace-event.h"
|
2015-06-14 16:19:26 +08:00
|
|
|
#include "stat.h"
|
2019-06-26 04:31:26 +08:00
|
|
|
#include "string2.h"
|
2017-11-30 02:43:46 +08:00
|
|
|
#include "memswap.h"
|
2019-08-27 22:51:18 +08:00
|
|
|
#include "util.h"
|
2019-08-30 01:59:50 +08:00
|
|
|
#include "../perf-sys.h"
|
2016-10-13 05:02:06 +08:00
|
|
|
#include "util/parse-branch-options.h"
|
2019-08-21 22:30:29 +08:00
|
|
|
#include <internal/xyarray.h>
|
2019-09-03 21:56:06 +08:00
|
|
|
#include <internal/lib.h>
|
2011-01-04 02:39:04 +08:00
|
|
|
|
tools perf: Move from sane_ctype.h obtained from git to the Linux's original
We got the sane_ctype.h headers from git and kept using it so far, but
since that code originally came from the kernel sources to the git
sources, perhaps it's better to just use the one in the kernel, so that
we can leverage tools/perf/check_headers.sh to be notified when our copy
gets out of sync, i.e. when fixes or goodies are added to the code we've
copied.
This will help with things like tools/lib/string.c where we want to have
more things in common with the kernel, such as strim(), skip_spaces(),
etc so as to go on removing the things that we have in tools/perf/util/
and instead using the code in the kernel, indirectly and removing things
like EXPORT_SYMBOL(), etc, getting notified when fixes and improvements
are made to the original code.
Hopefully this also should help with reducing the difference of code
hosted in tools/ to the one in the kernel proper.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lkml.kernel.org/n/tip-7k9868l713wqtgo01xxygn12@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-06-26 04:27:31 +08:00
|
|
|
#include <linux/ctype.h>
|
2017-04-18 03:10:49 +08:00
|
|
|
|
2018-02-02 22:27:25 +08:00
|
|
|
struct perf_missing_features perf_missing_features;
|
2012-12-14 00:13:07 +08:00
|
|
|
|
2015-03-31 06:19:31 +08:00
|
|
|
static clockid_t clockid;
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
/*
 * Default ->init() hook installed in perf_evsel__object: there is no extra
 * per-evsel state to set up, so it simply reports success.
 */
static int perf_evsel__no_extra_init(struct evsel *evsel __maybe_unused)
{
	return 0;
}
|
|
|
|
|
2017-07-03 22:50:18 +08:00
|
|
|
/*
 * Empty default marked __weak, presumably so that another object file can
 * supply the real implementation when it is linked in.
 */
void __weak test_attr__ready(void)
{
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
/*
 * Default ->fini() hook installed in perf_evsel__object: nothing extra was
 * set up by the default ->init(), so there is nothing to tear down.
 */
static void perf_evsel__no_extra_fini(struct evsel *evsel __maybe_unused)
{
}
|
|
|
|
|
|
|
|
static struct {
|
|
|
|
size_t size;
|
2019-07-21 19:23:51 +08:00
|
|
|
int (*init)(struct evsel *evsel);
|
|
|
|
void (*fini)(struct evsel *evsel);
|
2014-10-10 02:29:51 +08:00
|
|
|
} perf_evsel__object = {
|
2019-07-21 19:23:51 +08:00
|
|
|
.size = sizeof(struct evsel),
|
2014-10-10 02:29:51 +08:00
|
|
|
.init = perf_evsel__no_extra_init,
|
|
|
|
.fini = perf_evsel__no_extra_fini,
|
|
|
|
};
|
|
|
|
|
|
|
|
int perf_evsel__object_config(size_t object_size,
|
2019-07-21 19:23:51 +08:00
|
|
|
int (*init)(struct evsel *evsel),
|
|
|
|
void (*fini)(struct evsel *evsel))
|
2014-10-10 02:29:51 +08:00
|
|
|
{
|
|
|
|
|
|
|
|
if (object_size == 0)
|
|
|
|
goto set_methods;
|
|
|
|
|
|
|
|
if (perf_evsel__object.size > object_size)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
perf_evsel__object.size = object_size;
|
|
|
|
|
|
|
|
set_methods:
|
|
|
|
if (init != NULL)
|
|
|
|
perf_evsel__object.init = init;
|
|
|
|
|
|
|
|
if (fini != NULL)
|
|
|
|
perf_evsel__object.fini = fini;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:24:45 +08:00
|
|
|
/*
 * FD - lvalue for the int stored at (x, y) in an evsel's core.fd xyarray.
 *
 * The evsel argument is parenthesized so that expressions such as
 * FD(evsel + 1, x, y) bind to the whole argument rather than to its last
 * operand.
 */
#define FD(e, x, y) (*(int *)xyarray__entry((e)->core.fd, x, y))
|
2011-01-04 03:45:52 +08:00
|
|
|
|
2020-04-30 03:00:27 +08:00
|
|
|
/*
 * __evsel__sample_size - size in bytes implied by a sample_type
 * @sample_type: PERF_SAMPLE_* bit mask
 *
 * Every bit of @sample_type that is also in PERF_SAMPLE_MASK accounts for
 * one u64; the result is the number of such bits times sizeof(u64).
 */
int __evsel__sample_size(u64 sample_type)
{
	u64 pending = sample_type & PERF_SAMPLE_MASK;
	int nr_fields = 0;

	/* Clear the lowest set bit on each pass until none are left. */
	while (pending) {
		pending &= pending - 1;
		nr_fields++;
	}

	return nr_fields * sizeof(u64);
}
|
|
|
|
|
2013-08-27 16:23:09 +08:00
|
|
|
/**
|
|
|
|
* __perf_evsel__calc_id_pos - calculate id_pos.
|
|
|
|
* @sample_type: sample type
|
|
|
|
*
|
|
|
|
* This function returns the position of the event id (PERF_SAMPLE_ID or
|
|
|
|
* PERF_SAMPLE_IDENTIFIER) in a sample event i.e. in the array of struct
|
2019-08-27 06:02:31 +08:00
|
|
|
* perf_record_sample.
|
2013-08-27 16:23:09 +08:00
|
|
|
*/
|
|
|
|
static int __perf_evsel__calc_id_pos(u64 sample_type)
|
|
|
|
{
|
|
|
|
int idx = 0;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_IDENTIFIER)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!(sample_type & PERF_SAMPLE_ID))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_IP)
|
|
|
|
idx += 1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_TID)
|
|
|
|
idx += 1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_TIME)
|
|
|
|
idx += 1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_ADDR)
|
|
|
|
idx += 1;
|
|
|
|
|
|
|
|
return idx;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __perf_evsel__calc_is_pos - calculate is_pos.
|
|
|
|
* @sample_type: sample type
|
|
|
|
*
|
|
|
|
* This function returns the position (counting backwards) of the event id
|
|
|
|
* (PERF_SAMPLE_ID or PERF_SAMPLE_IDENTIFIER) in a non-sample event i.e. if
|
|
|
|
* sample_id_all is used there is an id sample appended to non-sample events.
|
|
|
|
*/
|
|
|
|
static int __perf_evsel__calc_is_pos(u64 sample_type)
|
|
|
|
{
|
|
|
|
int idx = 1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_IDENTIFIER)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (!(sample_type & PERF_SAMPLE_ID))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_CPU)
|
|
|
|
idx += 1;
|
|
|
|
|
|
|
|
if (sample_type & PERF_SAMPLE_STREAM_ID)
|
|
|
|
idx += 1;
|
|
|
|
|
|
|
|
return idx;
|
|
|
|
}
|
|
|
|
|
2020-04-30 02:58:40 +08:00
|
|
|
void evsel__calc_id_pos(struct evsel *evsel)
|
2013-08-27 16:23:09 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->id_pos = __perf_evsel__calc_id_pos(evsel->core.attr.sample_type);
|
|
|
|
evsel->is_pos = __perf_evsel__calc_is_pos(evsel->core.attr.sample_type);
|
2013-08-27 16:23:09 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 03:12:15 +08:00
|
|
|
void __evsel__set_sample_bit(struct evsel *evsel,
|
2012-12-11 01:53:43 +08:00
|
|
|
enum perf_event_sample_format bit)
|
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
if (!(evsel->core.attr.sample_type & bit)) {
|
|
|
|
evsel->core.attr.sample_type |= bit;
|
2012-12-11 01:53:43 +08:00
|
|
|
evsel->sample_size += sizeof(u64);
|
2020-04-30 02:58:40 +08:00
|
|
|
evsel__calc_id_pos(evsel);
|
2012-12-11 01:53:43 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:12:15 +08:00
|
|
|
void __evsel__reset_sample_bit(struct evsel *evsel,
|
2012-12-11 01:53:43 +08:00
|
|
|
enum perf_event_sample_format bit)
|
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
if (evsel->core.attr.sample_type & bit) {
|
|
|
|
evsel->core.attr.sample_type &= ~bit;
|
2012-12-11 01:53:43 +08:00
|
|
|
evsel->sample_size -= sizeof(u64);
|
2020-04-30 02:58:40 +08:00
|
|
|
evsel__calc_id_pos(evsel);
|
2012-12-11 01:53:43 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:12:15 +08:00
|
|
|
void evsel__set_sample_id(struct evsel *evsel,
|
2013-08-27 16:23:09 +08:00
|
|
|
bool can_sample_identifier)
|
2012-12-11 02:21:30 +08:00
|
|
|
{
|
2013-08-27 16:23:09 +08:00
|
|
|
if (can_sample_identifier) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, ID);
|
|
|
|
evsel__set_sample_bit(evsel, IDENTIFIER);
|
2013-08-27 16:23:09 +08:00
|
|
|
} else {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, ID);
|
2013-08-27 16:23:09 +08:00
|
|
|
}
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.read_format |= PERF_FORMAT_ID;
|
2012-12-11 02:21:30 +08:00
|
|
|
}
|
|
|
|
|
2016-07-07 22:51:47 +08:00
|
|
|
/**
|
2020-04-30 21:51:16 +08:00
|
|
|
* evsel__is_function_event - Return whether given evsel is a function
|
2016-07-07 22:51:47 +08:00
|
|
|
* trace event
|
|
|
|
*
|
|
|
|
* @evsel - evsel selector to be tested
|
|
|
|
*
|
|
|
|
* Return %true if event is function trace event
|
|
|
|
*/
|
2020-04-30 21:51:16 +08:00
|
|
|
bool evsel__is_function_event(struct evsel *evsel)
|
2016-07-07 22:51:47 +08:00
|
|
|
{
|
|
|
|
#define FUNCTION_EVENT "ftrace:function"
|
|
|
|
|
|
|
|
return evsel->name &&
|
|
|
|
!strncmp(FUNCTION_EVENT, evsel->name, sizeof(FUNCTION_EVENT));
|
|
|
|
|
|
|
|
#undef FUNCTION_EVENT
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:53 +08:00
|
|
|
void evsel__init(struct evsel *evsel,
|
|
|
|
struct perf_event_attr *attr, int idx)
|
2011-01-19 07:41:45 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
perf_evsel__init(&evsel->core, attr);
|
2011-01-19 07:41:45 +08:00
|
|
|
evsel->idx = idx;
|
2014-07-31 14:00:52 +08:00
|
|
|
evsel->tracking = !idx;
|
2012-11-29 14:38:29 +08:00
|
|
|
evsel->leader = evsel;
|
2013-11-13 00:58:49 +08:00
|
|
|
evsel->unit = "";
|
|
|
|
evsel->scale = 1.0;
|
perf evsel: Introduce per event max_events property
This simply adds the field to 'struct perf_evsel' and allows setting
it via the event parser, to test it lets trace trace:
First look at where in a function that receives an evsel we can put a probe
to read how evsel->max_events was setup:
# perf probe -x ~/bin/perf -L trace__event_handler
<trace__event_handler@/home/acme/git/perf/tools/perf/builtin-trace.c:0>
0 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
3 {
4 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
5 int callchain_ret = 0;
7 if (sample->callchain) {
8 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
9 if (callchain_ret == 0) {
10 if (callchain_cursor.nr < trace->min_stack)
11 goto out;
12 callchain_ret = 1;
}
}
See what variables we can probe at line 7:
# perf probe -x ~/bin/perf -V trace__event_handler:7
Available variables at trace__event_handler:7
@<trace__event_handler+89>
int callchain_ret
struct perf_evsel* evsel
struct perf_sample* sample
struct thread* thread
struct trace* trace
union perf_event* event
Add a probe at that line asking for evsel->max_events to be collected and named
as "max_events":
# perf probe -x ~/bin/perf trace__event_handler:7 'max_events=evsel->max_events'
Added new event:
probe_perf:trace__event_handler (on trace__event_handler:7 in /home/acme/bin/perf with max_events=evsel->max_events)
You can now use it in all perf tools, such as:
perf record -e probe_perf:trace__event_handler -aR sleep 1
Now use 'perf trace', here aliased to just 'trace' and trace trace, i.e.
the first 'trace' is tracing just that 'probe_perf:trace__event_handler' event,
while the traced trace is tracing all scheduler tracepoints, will stop at two
events (--max-events 2) and will just set evsel->max_events for all the sched
tracepoints to 9, we will see the output of both traces intermixed:
# trace -e *perf:*event_handler trace --max-events 2 -e sched:*/nr=9/
0.000 :0/0 sched:sched_waking:comm=rcu_sched pid=10 prio=120 target_cpu=000
0.009 :0/0 sched:sched_wakeup:comm=rcu_sched pid=10 prio=120 target_cpu=000
0.000 trace/23949 probe_perf:trace__event_handler:(48c34a) max_events=0x9
0.046 trace/23949 probe_perf:trace__event_handler:(48c34a) max_events=0x9
#
Now, if the traced trace sends its output to /dev/null, we'll see just
what the first level trace outputs: that evsel->max_events is indeed
being set to 9:
# trace -e *perf:*event_handler trace -o /dev/null --max-events 2 -e sched:*/nr=9/
0.000 trace/23961 probe_perf:trace__event_handler:(48c34a) max_events=0x9
0.030 trace/23961 probe_perf:trace__event_handler:(48c34a) max_events=0x9
#
Now that we can set evsel->max_events, we can go to the next step, honour that
per-event property in 'perf trace'.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-og00yasj276joem6e14l1eas@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-10-20 02:47:34 +08:00
|
|
|
evsel->max_events = ULONG_MAX;
|
2015-08-27 20:07:40 +08:00
|
|
|
evsel->evlist = NULL;
|
2019-07-16 03:22:57 +08:00
|
|
|
evsel->bpf_obj = NULL;
|
perf bpf: Attach eBPF filter to perf event
This is the final patch which makes basic BPF filter work. After
applying this patch, users are allowed to use BPF filter like:
# perf record --event ./hello_world.o ls
A bpf_fd field is appended to 'struct evsel', and setup during the
callback function add_bpf_event() for each 'probe_trace_event'.
PERF_EVENT_IOC_SET_BPF ioctl is used to attach eBPF program to a newly
created perf event. The file descriptor of the eBPF program is passed to
perf record using previous patches, and stored into evsel->bpf_fd.
It is possible that different perf event are created for one kprobe
events for different CPUs. In this case, when trying to call the ioctl,
EEXIST will be return. This patch doesn't treat it as an error.
Committer note:
The bpf proggie used so far:
__attribute__((section("fork=_do_fork"), used))
int fork(void *ctx)
{
return 0;
}
char _license[] __attribute__((section("license"), used)) = "GPL";
int _version __attribute__((section("version"), used)) = 0x40300;
failed to produce any samples, even with forks happening and it being
running in system wide mode.
That is because now the filter is being associated, and the code above
always returns zero, meaning that all forks will be probed but filtered
away ;-/
Change it to 'return 1;' instead and after that:
# trace --no-syscalls --event /tmp/foo.o
0.000 perf_bpf_probe:fork:(ffffffff8109be30))
2.333 perf_bpf_probe:fork:(ffffffff8109be30))
3.725 perf_bpf_probe:fork:(ffffffff8109be30))
4.550 perf_bpf_probe:fork:(ffffffff8109be30))
^C#
And it works with all tools, including 'perf trace'.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1444826502-49291-8-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-14 20:41:18 +08:00
|
|
|
evsel->bpf_fd = -1;
|
2015-07-29 17:42:10 +08:00
|
|
|
INIT_LIST_HEAD(&evsel->config_terms);
|
2014-10-10 02:29:51 +08:00
|
|
|
perf_evsel__object.init(evsel);
|
2020-04-30 03:00:27 +08:00
|
|
|
evsel->sample_size = __evsel__sample_size(attr->sample_type);
|
2020-04-30 02:58:40 +08:00
|
|
|
evsel__calc_id_pos(evsel);
|
2015-07-10 15:36:09 +08:00
|
|
|
evsel->cmdline_group_boundary = false;
|
perf stat: Output JSON MetricExpr metric
Add generic infrastructure to perf stat to output ratios for
"MetricExpr" entries in the event lists. Many events are more useful as
ratios than in raw form, typically some count in relation to total
ticks.
Transfer the MetricExpr information from the alias to the evsel.
We mark the events that need to be collected for MetricExpr, and also
link the events using them with a pointer. The code is careful to always
prefer the right event in the same group to minimize multiplexing
errors. At the moment only a single relation is supported.
Then add a rblist to the stat shadow code that remembers stats based on
the cpu and context.
Then finally update and retrieve and print these values similarly to the
existing hardcoded perf metrics. We use the simple expression parser
added earlier to evaluate the expression.
Normally we just output the result without further commentary, but for
--metric-only this would lead to empty columns. So for this case use the
original event as description.
There is no attempt to automatically add the MetricExpr event, if it is
missing, however we suggest it to the user, because the user tool
doesn't have enough information to reliably construct a group that is
guaranteed to schedule. So we leave that to the user.
% perf stat -a -I 1000 -e '{unc_p_clockticks,unc_p_freq_max_os_cycles}'
1.000147889 800,085,181 unc_p_clockticks
1.000147889 93,126,241 unc_p_freq_max_os_cycles # 11.6
2.000448381 800,218,217 unc_p_clockticks
2.000448381 142,516,095 unc_p_freq_max_os_cycles # 17.8
3.000639852 800,243,057 unc_p_clockticks
3.000639852 162,292,689 unc_p_freq_max_os_cycles # 20.3
% perf stat -a -I 1000 -e '{unc_p_clockticks,unc_p_freq_max_os_cycles}' --metric-only
# time freq_max_os_cycles %
1.000127077 0.9
2.000301436 0.7
3.000456379 0.0
v2: Change from DivideBy to MetricExpr
v3: Use expr__ prefix. Support more than one other event.
v4: Update description
v5: Only print warning message once for multiple PMUs.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20170320201711.14142-11-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-03-21 04:17:08 +08:00
|
|
|
evsel->metric_expr = NULL;
|
2017-03-21 04:17:10 +08:00
|
|
|
evsel->metric_name = NULL;
|
perf stat: Output JSON MetricExpr metric
Add generic infrastructure to perf stat to output ratios for
"MetricExpr" entries in the event lists. Many events are more useful as
ratios than in raw form, typically some count in relation to total
ticks.
Transfer the MetricExpr information from the alias to the evsel.
We mark the events that need to be collected for MetricExpr, and also
link the events using them with a pointer. The code is careful to always
prefer the right event in the same group to minimize multiplexing
errors. At the moment only a single relation is supported.
Then add a rblist to the stat shadow code that remembers stats based on
the cpu and context.
Then finally update and retrieve and print these values similarly to the
existing hardcoded perf metrics. We use the simple expression parser
added earlier to evaluate the expression.
Normally we just output the result without further commentary, but for
--metric-only this would lead to empty columns. So for this case use the
original event as description.
There is no attempt to automatically add the MetricExpr event, if it is
missing, however we suggest it to the user, because the user tool
doesn't have enough information to reliably construct a group that is
guaranteed to schedule. So we leave that to the user.
% perf stat -a -I 1000 -e '{unc_p_clockticks,unc_p_freq_max_os_cycles}'
1.000147889 800,085,181 unc_p_clockticks
1.000147889 93,126,241 unc_p_freq_max_os_cycles # 11.6
2.000448381 800,218,217 unc_p_clockticks
2.000448381 142,516,095 unc_p_freq_max_os_cycles # 17.8
3.000639852 800,243,057 unc_p_clockticks
3.000639852 162,292,689 unc_p_freq_max_os_cycles # 20.3
% perf stat -a -I 1000 -e '{unc_p_clockticks,unc_p_freq_max_os_cycles}' --metric-only
# time freq_max_os_cycles %
1.000127077 0.9
2.000301436 0.7
3.000456379 0.0
v2: Change from DivideBy to MetricExpr
v3: Use expr__ prefix. Support more than one other event.
v4: Update description
v5: Only print warning message once for multiple PMUs.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20170320201711.14142-11-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-03-21 04:17:08 +08:00
|
|
|
evsel->metric_events = NULL;
|
|
|
|
evsel->collect_stat = false;
|
2018-03-06 22:04:43 +08:00
|
|
|
evsel->pmu_name = NULL;
|
2011-01-19 07:41:45 +08:00
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
|
2011-01-04 02:39:04 +08:00
|
|
|
{
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *evsel = zalloc(perf_evsel__object.size);
|
2011-01-04 02:39:04 +08:00
|
|
|
|
2018-08-24 23:45:56 +08:00
|
|
|
if (!evsel)
|
|
|
|
return NULL;
|
2019-07-21 19:23:53 +08:00
|
|
|
evsel__init(evsel, attr, idx);
|
2011-01-04 02:39:04 +08:00
|
|
|
|
2020-04-30 21:51:16 +08:00
|
|
|
if (evsel__is_bpf_output(evsel)) {
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
|
2016-04-01 21:26:42 +08:00
|
|
|
PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD),
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.sample_period = 1;
|
perf tools: Introduce bpf-output event
Commit a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
adds a helper to enable a BPF program to output data to a perf ring
buffer through a new type of perf event, PERF_COUNT_SW_BPF_OUTPUT. This
patch enables perf to create events of that type. Now a perf user can
use the following cmdline to receive output data from BPF programs:
# perf record -a -e bpf-output/no-inherit,name=evt/ \
-e ./test_bpf_output.c/map:channel.event=evt/ ls /
# perf script
perf 1560 [004] 347747.086295: evt: ffffffff811fd201 sys_write ...
perf 1560 [004] 347747.086300: evt: ffffffff811fd201 sys_write ...
perf 1560 [004] 347747.086315: evt: ffffffff811fd201 sys_write ...
...
Test result:
# cat test_bpf_output.c
/************************ BEGIN **************************/
#include <uapi/linux/bpf.h>
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
};
#define SEC(NAME) __attribute__((section(NAME), used))
static u64 (*ktime_get_ns)(void) =
(void *)BPF_FUNC_ktime_get_ns;
static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
(void *)BPF_FUNC_trace_printk;
static int (*get_smp_processor_id)(void) =
(void *)BPF_FUNC_get_smp_processor_id;
static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) =
(void *)BPF_FUNC_perf_event_output;
struct bpf_map_def SEC("maps") channel = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = __NR_CPUS__,
};
SEC("func_write=sys_write")
int func_write(void *ctx)
{
struct {
u64 ktime;
int cpuid;
} __attribute__((packed)) output_data;
char error_data[] = "Error: failed to output: %d\n";
output_data.cpuid = get_smp_processor_id();
output_data.ktime = ktime_get_ns();
int err = perf_event_output(ctx, &channel, get_smp_processor_id(),
&output_data, sizeof(output_data));
if (err)
trace_printk(error_data, sizeof(error_data), err);
return 0;
}
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
/************************ END ***************************/
# perf record -a -e bpf-output/no-inherit,name=evt/ \
-e ./test_bpf_output.c/map:channel.event=evt/ ls /
# perf script | grep ls
ls 2242 [003] 347851.557563: evt: ffffffff811fd201 sys_write ...
ls 2242 [003] 347851.557571: evt: ffffffff811fd201 sys_write ...
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Cody P Schafer <dev@codyps.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jeremie Galarneau <jeremie.galarneau@efficios.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kirill Smelkov <kirr@nexedi.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1456132275-98875-11-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-02-22 17:10:37 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 21:51:16 +08:00
|
|
|
if (evsel__is_clock(evsel)) {
|
perf stat: Get rid of extra clock display function
There's no reason to have separate function to display clock events.
It's only purpose was to convert the nanosecond value into microseconds.
We do that now in generic code, if the unit and scale values are
properly set, which this patch do for clock events.
The output differs in the unit field being displayed in its columns
rather than having it added as a suffix of the event name. Plus the
value is rounded into 2 decimal numbers as for any other event.
Before:
# perf stat -e cpu-clock,task-clock -C 0 sleep 3
Performance counter stats for 'CPU(s) 0':
3001.123137 cpu-clock (msec) # 1.000 CPUs utilized
3001.133250 task-clock (msec) # 1.000 CPUs utilized
3.001159813 seconds time elapsed
Now:
# perf stat -e cpu-clock,task-clock -C 0 sleep 3
Performance counter stats for 'CPU(s) 0':
3,001.05 msec cpu-clock # 1.000 CPUs utilized
3,001.05 msec task-clock # 1.000 CPUs utilized
3.001077794 seconds time elapsed
There's a small difference in csv output, as we now output the unit
field, which was empty before. It's in the proper spot, so there's no
compatibility issue.
Before:
# perf stat -e cpu-clock,task-clock -C 0 -x, sleep 3
3001.065177,,cpu-clock,3001064187,100.00,1.000,CPUs utilized
3001.077085,,task-clock,3001077085,100.00,1.000,CPUs utilized
# perf stat -e cpu-clock,task-clock -C 0 -x, sleep 3
3000.80,msec,cpu-clock,3000799026,100.00,1.000,CPUs utilized
3000.80,msec,task-clock,3000799550,100.00,1.000,CPUs utilized
Add perf_evsel__is_clock to replace nsec_counter.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180720110036.32251-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-07-20 19:00:34 +08:00
|
|
|
/*
|
|
|
|
* The evsel->unit points to static alias->unit
|
|
|
|
* so it's ok to use static string in here.
|
|
|
|
*/
|
|
|
|
static const char *unit = "msec";
|
|
|
|
|
|
|
|
evsel->unit = unit;
|
|
|
|
evsel->scale = 1e-6;
|
|
|
|
}
|
|
|
|
|
2011-01-04 02:39:04 +08:00
|
|
|
return evsel;
|
|
|
|
}
|
|
|
|
|
perf evsel: Fix attr.exclude_kernel setting for default cycles:p
Yet another fix for probing the max attr.precise_ip setting: it is not
enough setting attr.exclude_kernel for !root users, as they _can_
profile the kernel if the kernel.perf_event_paranoid sysctl is set to
-1, so check that as well.
Testing it:
As non root:
$ sysctl kernel.perf_event_paranoid
kernel.perf_event_paranoid = 2
$ perf record sleep 1
$ perf evlist -v
cycles:uppp: ..., exclude_kernel: 1, ... precise_ip: 3, ...
Now as non-root, but with kernel.perf_event_paranoid set to the
most permissive value, -1:
$ sysctl kernel.perf_event_paranoid
kernel.perf_event_paranoid = -1
$ perf record sleep 1
$ perf evlist -v
cycles:ppp: ..., exclude_kernel: 0, ... precise_ip: 3, ...
$
I.e. non-root, default kernel.perf_event_paranoid: :uppp modifier = not allowed to sample the kernel,
non-root, most permissible kernel.perf_event_paranoid: :ppp = allowed to sample the kernel.
In both cases, use the highest available precision: attr.precise_ip = 3.
Reported-and-Tested-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: d37a36979077 ("perf evsel: Fix attr.exclude_kernel setting for default cycles:p")
Link: http://lkml.kernel.org/n/tip-nj2qkf75xsd6pw6hhjzfqqdx@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-09-23 02:41:44 +08:00
|
|
|
static bool perf_event_can_profile_kernel(void)
|
|
|
|
{
|
perf evsel: Kernel profiling is disallowed only when perf_event_paranoid > 1
Perf was too restrictive about sysctl kernel.perf_event_paranoid. The
kernel only disallows profiling when perf_event_paranoid > 1. Make perf
do the same.
Committer testing:
For a non-root user:
$ id
uid=1000(acme) gid=1000(acme) groups=1000(acme),10(wheel) context=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
$
Before:
We were restricting it to just userspace (:u suffix) even for a
workload started by the user:
$ perf record sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.001 MB perf.data (8 samples) ]
$ perf evlist
cycles:u
$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
$ perf report --stdio
# To display the perf.data header info, please use --header/--header-only options.
#
# Total Lost Samples: 0
#
# Samples: 8 of event 'cycles:u'
# Event count (approx.): 1040396
#
# Overhead Command Shared Object Symbol
# ........ ....... ................ ......................
#
68.36% sleep libc-2.29.so [.] _dl_addr
27.33% sleep ld-2.29.so [.] dl_main
3.80% sleep ld-2.29.so [.] _dl_setup_hash
#
# (Tip: Order by the overhead of source file name and line number: perf report -s srcline)
#
$
$
After:
When the kernel allows profiling the kernel in that scenario:
$ perf record sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.023 MB perf.data (11 samples) ]
$ perf evlist
cycles
$ perf evlist -v
cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
$
$ perf report --stdio
# To display the perf.data header info, please use --header/--header-only options.
#
# Total Lost Samples: 0
#
# Samples: 11 of event 'cycles'
# Event count (approx.): 1601964
#
# Overhead Command Shared Object Symbol
# ........ ....... ................ ..........................
#
28.14% sleep [kernel.vmlinux] [k] __rb_erase_color
27.21% sleep [kernel.vmlinux] [k] unmap_page_range
27.20% sleep ld-2.29.so [.] __tunable_get_val
15.24% sleep [kernel.vmlinux] [k] thp_get_unmapped_area
1.96% perf [kernel.vmlinux] [k] perf_event_exec
0.22% perf [kernel.vmlinux] [k] native_sched_clock
0.02% perf [kernel.vmlinux] [k] intel_bts_enable_local
0.00% perf [kernel.vmlinux] [k] native_write_msr
#
# (Tip: Boolean options have negative forms, e.g.: perf report --no-children)
#
$
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Igor Lubashev <ilubashe@akamai.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1566869956-7154-4-git-send-email-ilubashe@akamai.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-08-27 09:39:14 +08:00
|
|
|
return perf_event_paranoid_check(1);
|
perf evsel: Fix attr.exclude_kernel setting for default cycles:p
Yet another fix for probing the max attr.precise_ip setting: it is not
enough setting attr.exclude_kernel for !root users, as they _can_
profile the kernel if the kernel.perf_event_paranoid sysctl is set to
-1, so check that as well.
Testing it:
As non root:
$ sysctl kernel.perf_event_paranoid
kernel.perf_event_paranoid = 2
$ perf record sleep 1
$ perf evlist -v
cycles:uppp: ..., exclude_kernel: 1, ... precise_ip: 3, ...
Now as non-root, but with kernel.perf_event_paranoid set to the
most permissive value, -1:
$ sysctl kernel.perf_event_paranoid
kernel.perf_event_paranoid = -1
$ perf record sleep 1
$ perf evlist -v
cycles:ppp: ..., exclude_kernel: 0, ... precise_ip: 3, ...
$
I.e. non-root, default kernel.perf_event_paranoid: :uppp modifier = not allowed to sample the kernel,
non-root, most permissible kernel.perf_event_paranoid: :ppp = allowed to sample the kernel.
In both cases, use the highest available precision: attr.precise_ip = 3.
Reported-and-Tested-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: d37a36979077 ("perf evsel: Fix attr.exclude_kernel setting for default cycles:p")
Link: http://lkml.kernel.org/n/tip-nj2qkf75xsd6pw6hhjzfqqdx@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-09-23 02:41:44 +08:00
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *perf_evsel__new_cycles(bool precise)
|
2016-07-29 05:33:20 +08:00
|
|
|
{
|
|
|
|
struct perf_event_attr attr = {
|
|
|
|
.type = PERF_TYPE_HARDWARE,
|
|
|
|
.config = PERF_COUNT_HW_CPU_CYCLES,
|
perf evsel: Fix attr.exclude_kernel setting for default cycles:p
Yet another fix for probing the max attr.precise_ip setting: it is not
enough setting attr.exclude_kernel for !root users, as they _can_
profile the kernel if the kernel.perf_event_paranoid sysctl is set to
-1, so check that as well.
Testing it:
As non root:
$ sysctl kernel.perf_event_paranoid
kernel.perf_event_paranoid = 2
$ perf record sleep 1
$ perf evlist -v
cycles:uppp: ..., exclude_kernel: 1, ... precise_ip: 3, ...
Now as non-root, but with kernel.perf_event_paranoid set to the
most permissive value, -1:
$ sysctl kernel.perf_event_paranoid
kernel.perf_event_paranoid = -1
$ perf record sleep 1
$ perf evlist -v
cycles:ppp: ..., exclude_kernel: 0, ... precise_ip: 3, ...
$
I.e. non-root, default kernel.perf_event_paranoid: :uppp modifier = not allowed to sample the kernel,
non-root, most permissible kernel.perf_event_paranoid: :ppp = allowed to sample the kernel.
In both cases, use the highest available precision: attr.precise_ip = 3.
Reported-and-Tested-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: d37a36979077 ("perf evsel: Fix attr.exclude_kernel setting for default cycles:p")
Link: http://lkml.kernel.org/n/tip-nj2qkf75xsd6pw6hhjzfqqdx@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-09-23 02:41:44 +08:00
|
|
|
.exclude_kernel = !perf_event_can_profile_kernel(),
|
2016-07-29 05:33:20 +08:00
|
|
|
};
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *evsel;
|
2016-07-29 05:33:20 +08:00
|
|
|
|
|
|
|
event_attr_init(&attr);
|
2017-07-04 00:05:43 +08:00
|
|
|
|
|
|
|
if (!precise)
|
|
|
|
goto new_event;
|
2016-07-29 05:33:20 +08:00
|
|
|
|
perf evsel: Fix probing of precise_ip level for default cycles event
Since commit 18e7a45af91a ("perf/x86: Reject non sampling events with
precise_ip") returns -EINVAL for sys_perf_event_open() with an attribute
with (attr.precise_ip > 0 && attr.sample_period == 0), just like is done
in the routine used to probe the max precise level when no events were
passed to 'perf record' or 'perf top', i.e.:
perf_evsel__new_cycles()
perf_event_attr__set_max_precise_ip()
The x86 code, in x86_pmu_hw_config(), which is called all the way from
sys_perf_event_open() did, starting with the aforementioned commit:
/* There's no sense in having PEBS for non sampling events: */
if (!is_sampling_event(event))
return -EINVAL;
Which makes it fail for cycles:ppp, cycles:pp and cycles:p, always using
just the non precise cycles variant.
To make sure that this is the case, I tested it, before this patch,
with:
# perf probe -L x86_pmu_hw_config
<x86_pmu_hw_config@/home/acme/git/linux/arch/x86/events/core.c:0>
0 int x86_pmu_hw_config(struct perf_event *event)
1 {
2 if (event->attr.precise_ip) {
<SNIP>
17 if (event->attr.precise_ip > precise)
18 return -EOPNOTSUPP;
/* There's no sense in having PEBS for non sampling events: */
21 if (!is_sampling_event(event))
22 return -EINVAL;
}
<SNIP>
# perf probe x86_pmu_hw_config:22
Added new events:
probe:x86_pmu_hw_config (on x86_pmu_hw_config:22)
probe:x86_pmu_hw_config_1 (on x86_pmu_hw_config:22)
You can now use it in all perf tools, such as:
perf record -e probe:x86_pmu_hw_config_1 -aR sleep 1
# perf trace -e perf_event_open,probe:x86_pmu_hwconfig*/max-stack=16/ perf record usleep 1
0.000 ( 0.015 ms): perf/4150 perf_event_open(attr_uptr: 0x7ffebc8ba110, cpu: -1, group_fd: -1 ) ...
0.015 ( ): probe:x86_pmu_hw_config:(ffffffff9c0065e1))
x86_pmu_hw_config ([kernel.kallsyms])
hsw_hw_config ([kernel.kallsyms])
x86_pmu_event_init ([kernel.kallsyms])
perf_try_init_event ([kernel.kallsyms])
perf_event_alloc ([kernel.kallsyms])
SYSC_perf_event_open ([kernel.kallsyms])
sys_perf_event_open ([kernel.kallsyms])
do_syscall_64 ([kernel.kallsyms])
return_from_SYSCALL_64 ([kernel.kallsyms])
syscall (/usr/lib64/libc-2.24.so)
perf_event_attr__set_max_precise_ip (/home/acme/bin/perf)
perf_evsel__new_cycles (/home/acme/bin/perf)
perf_evlist__add_default (/home/acme/bin/perf)
cmd_record (/home/acme/bin/perf)
run_builtin (/home/acme/bin/perf)
handle_internal_command (/home/acme/bin/perf)
0.000 ( 0.021 ms): perf/4150 ... [continued]: perf_event_open()) = -1 EINVAL Invalid argument
0.023 ( 0.002 ms): perf/4150 perf_event_open(attr_uptr: 0x7ffebc8ba110, cpu: -1, group_fd: -1 ) ...
0.025 ( ): probe:x86_pmu_hw_config:(ffffffff9c0065e1))
x86_pmu_hw_config ([kernel.kallsyms])
hsw_hw_config ([kernel.kallsyms])
x86_pmu_event_init ([kernel.kallsyms])
perf_try_init_event ([kernel.kallsyms])
perf_event_alloc ([kernel.kallsyms])
SYSC_perf_event_open ([kernel.kallsyms])
sys_perf_event_open ([kernel.kallsyms])
do_syscall_64 ([kernel.kallsyms])
return_from_SYSCALL_64 ([kernel.kallsyms])
syscall (/usr/lib64/libc-2.24.so)
perf_event_attr__set_max_precise_ip (/home/acme/bin/perf)
perf_evsel__new_cycles (/home/acme/bin/perf)
perf_evlist__add_default (/home/acme/bin/perf)
cmd_record (/home/acme/bin/perf)
run_builtin (/home/acme/bin/perf)
handle_internal_command (/home/acme/bin/perf)
0.023 ( 0.004 ms): perf/4150 ... [continued]: perf_event_open()) = -1 EINVAL Invalid argument
0.028 ( 0.002 ms): perf/4150 perf_event_open(attr_uptr: 0x7ffebc8ba110, cpu: -1, group_fd: -1 ) ...
0.030 ( ): probe:x86_pmu_hw_config:(ffffffff9c0065e1))
x86_pmu_hw_config ([kernel.kallsyms])
hsw_hw_config ([kernel.kallsyms])
x86_pmu_event_init ([kernel.kallsyms])
perf_try_init_event ([kernel.kallsyms])
perf_event_alloc ([kernel.kallsyms])
SYSC_perf_event_open ([kernel.kallsyms])
sys_perf_event_open ([kernel.kallsyms])
do_syscall_64 ([kernel.kallsyms])
return_from_SYSCALL_64 ([kernel.kallsyms])
syscall (/usr/lib64/libc-2.24.so)
perf_event_attr__set_max_precise_ip (/home/acme/bin/perf)
perf_evsel__new_cycles (/home/acme/bin/perf)
perf_evlist__add_default (/home/acme/bin/perf)
cmd_record (/home/acme/bin/perf)
run_builtin (/home/acme/bin/perf)
handle_internal_command (/home/acme/bin/perf)
0.028 ( 0.004 ms): perf/4150 ... [continued]: perf_event_open()) = -1 EINVAL Invalid argument
41.018 ( 0.012 ms): perf/4150 perf_event_open(attr_uptr: 0x7ffebc8b5dd0, pid: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
41.065 ( 0.011 ms): perf/4150 perf_event_open(attr_uptr: 0x3c7db78, pid: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
41.080 ( 0.006 ms): perf/4150 perf_event_open(attr_uptr: 0x3c7db78, pid: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
41.103 ( 0.010 ms): perf/4150 perf_event_open(attr_uptr: 0x3c4e748, pid: 4151 (perf), group_fd: -1, flags: FD_CLOEXEC) = 4
41.115 ( 0.006 ms): perf/4150 perf_event_open(attr_uptr: 0x3c4e748, pid: 4151 (perf), cpu: 1, group_fd: -1, flags: FD_CLOEXEC) = 5
41.122 ( 0.004 ms): perf/4150 perf_event_open(attr_uptr: 0x3c4e748, pid: 4151 (perf), cpu: 2, group_fd: -1, flags: FD_CLOEXEC) = 6
41.128 ( 0.008 ms): perf/4150 perf_event_open(attr_uptr: 0x3c4e748, pid: 4151 (perf), cpu: 3, group_fd: -1, flags: FD_CLOEXEC) = 8
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.017 MB perf.data (2 samples) ]
#
I.e. that return -EINVAL in x86_pmu_hw_config() is hit three times.
So fix it by just setting attr.sample_period
Now, after this patch:
# perf trace --max-stack=2 -e perf_event_open,probe:x86_pmu_hw_config* perf record usleep 1
[ perf record: Woken up 1 times to write data ]
0.000 ( 0.017 ms): perf/8469 perf_event_open(attr_uptr: 0x7ffe36c27d10, pid: -1, cpu: 3, group_fd: -1, flags: FD_CLOEXEC) = 4
syscall (/usr/lib64/libc-2.24.so)
perf_event_open_cloexec_flag (/home/acme/bin/perf)
0.050 ( 0.031 ms): perf/8469 perf_event_open(attr_uptr: 0x24ebb78, pid: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
syscall (/usr/lib64/libc-2.24.so)
perf_evlist__config (/home/acme/bin/perf)
0.092 ( 0.040 ms): perf/8469 perf_event_open(attr_uptr: 0x24ebb78, pid: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
syscall (/usr/lib64/libc-2.24.so)
perf_evlist__config (/home/acme/bin/perf)
0.143 ( 0.007 ms): perf/8469 perf_event_open(attr_uptr: 0x24bc748, cpu: -1, group_fd: -1 ) = 4
syscall (/usr/lib64/libc-2.24.so)
perf_event_attr__set_max_precise_ip (/home/acme/bin/perf)
0.161 ( 0.007 ms): perf/8469 perf_event_open(attr_uptr: 0x24bc748, pid: 8470 (perf), group_fd: -1, flags: FD_CLOEXEC) = 4
syscall (/usr/lib64/libc-2.24.so)
perf_evsel__open (/home/acme/bin/perf)
0.171 ( 0.005 ms): perf/8469 perf_event_open(attr_uptr: 0x24bc748, pid: 8470 (perf), cpu: 1, group_fd: -1, flags: FD_CLOEXEC) = 5
syscall (/usr/lib64/libc-2.24.so)
perf_evsel__open (/home/acme/bin/perf)
0.180 ( 0.007 ms): perf/8469 perf_event_open(attr_uptr: 0x24bc748, pid: 8470 (perf), cpu: 2, group_fd: -1, flags: FD_CLOEXEC) = 6
syscall (/usr/lib64/libc-2.24.so)
perf_evsel__open (/home/acme/bin/perf)
0.190 ( 0.005 ms): perf/8469 perf_event_open(attr_uptr: 0x24bc748, pid: 8470 (perf), cpu: 3, group_fd: -1, flags: FD_CLOEXEC) = 8
syscall (/usr/lib64/libc-2.24.so)
perf_evsel__open (/home/acme/bin/perf)
[ perf record: Captured and wrote 0.017 MB perf.data (7 samples) ]
#
The probe one called from perf_event_attr__set_max_precise_ip() works
the first time, with attr.precise_ip = 3, with the next ones being the
per cpu ones for the cycles:ppp event.
And here is the text from a report and alternative proposed patch by
Thomas-Mich Richter:
---
On s390 the counter and sampling facility do not support a precise IP
skid level and sometimes returns EOPNOTSUPP when structure member
precise_ip in struct perf_event_attr is not set to zero.
On s390 command 'perf record -- true' fails with error EOPNOTSUPP. This
happens only when no events are specified on command line.
The functions called are
...
--> perf_evlist__add_default
--> perf_evsel__new_cycles
--> perf_event_attr__set_max_precise_ip
The last function determines the value of structure member precise_ip by
invoking the perf_event_open() system call and checking the return code.
The first successful open is the value for precise_ip.
However the value is determined without setting member sample_period and
indicates no sampling.
On s390 the counter facility and sampling facility are different. The
above procedure determines a precise_ip value of 3 using the counter
facility. Later it uses the sampling facility with a value of 3 and
fails with EOPNOTSUPP.
---
v2: Older compilers (e.g. gcc 4.4.7) don't support referencing members
of unnamed union members in the container struct initialization, so
move from:
struct perf_event_attr attr = {
...
.sample_period = 1,
};
to right after it as:
struct perf_event_attr attr = {
...
};
attr.sample_period = 1;
v3: We need to reset .sample_period to 0 to let the users of
perf_evsel__new_cycles() to properly setup attr.sample_period or
attr.sample_freq. Reported by Ingo Molnar.
Reported-and-Acked-by: Thomas-Mich Richter <tmricht@linux.vnet.ibm.com>
Acked-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: 18e7a45af91a ("perf/x86: Reject non sampling events with precise_ip")
Link: http://lkml.kernel.org/n/tip-yv6nnkl7tzqocrm0hl3x7vf1@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-06-10 03:54:28 +08:00
|
|
|
/*
|
|
|
|
* Now let the usual logic to set up the perf_event_attr defaults
|
|
|
|
* to kick in when we return and before perf_evsel__open() is called.
|
|
|
|
*/
|
2017-07-04 00:05:43 +08:00
|
|
|
new_event:
|
2019-07-21 19:23:58 +08:00
|
|
|
evsel = evsel__new(&attr);
|
2016-07-29 05:33:20 +08:00
|
|
|
if (evsel == NULL)
|
|
|
|
goto out;
|
|
|
|
|
2019-03-14 22:00:10 +08:00
|
|
|
evsel->precise_max = true;
|
|
|
|
|
2016-07-29 05:33:20 +08:00
|
|
|
/* use asprintf() because free(evsel) assumes name is allocated */
|
2017-07-11 03:19:25 +08:00
|
|
|
if (asprintf(&evsel->name, "cycles%s%s%.*s",
|
|
|
|
(attr.precise_ip || attr.exclude_kernel) ? ":" : "",
|
|
|
|
attr.exclude_kernel ? "u" : "",
|
|
|
|
attr.precise_ip ? attr.precise_ip + 1 : 0, "ppp") < 0)
|
2016-07-29 05:33:20 +08:00
|
|
|
goto error_free;
|
|
|
|
out:
|
|
|
|
return evsel;
|
|
|
|
error_free:
|
2019-07-21 19:23:57 +08:00
|
|
|
evsel__delete(evsel);
|
2016-07-29 05:33:20 +08:00
|
|
|
evsel = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2015-09-07 16:38:06 +08:00
|
|
|
/*
|
|
|
|
* Returns pointer with encoded error via <linux/err.h> interface.
|
|
|
|
*/
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *perf_evsel__newtp_idx(const char *sys, const char *name, int idx)
|
2012-09-18 22:21:50 +08:00
|
|
|
{
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *evsel = zalloc(perf_evsel__object.size);
|
2015-09-07 16:38:06 +08:00
|
|
|
int err = -ENOMEM;
|
2012-09-18 22:21:50 +08:00
|
|
|
|
2015-09-07 16:38:06 +08:00
|
|
|
if (evsel == NULL) {
|
|
|
|
goto out_err;
|
|
|
|
} else {
|
2012-09-18 22:21:50 +08:00
|
|
|
struct perf_event_attr attr = {
|
2012-09-26 23:28:26 +08:00
|
|
|
.type = PERF_TYPE_TRACEPOINT,
|
|
|
|
.sample_type = (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
|
|
|
|
PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD),
|
2012-09-18 22:21:50 +08:00
|
|
|
};
|
|
|
|
|
2012-09-27 04:11:38 +08:00
|
|
|
if (asprintf(&evsel->name, "%s:%s", sys, name) < 0)
|
|
|
|
goto out_free;
|
|
|
|
|
2013-12-03 21:09:24 +08:00
|
|
|
evsel->tp_format = trace_event__tp_format(sys, name);
|
2015-09-07 16:38:06 +08:00
|
|
|
if (IS_ERR(evsel->tp_format)) {
|
|
|
|
err = PTR_ERR(evsel->tp_format);
|
2012-09-18 22:21:50 +08:00
|
|
|
goto out_free;
|
2015-09-07 16:38:06 +08:00
|
|
|
}
|
2012-09-18 22:21:50 +08:00
|
|
|
|
2012-09-26 23:28:26 +08:00
|
|
|
event_attr_init(&attr);
|
2012-09-18 22:21:50 +08:00
|
|
|
attr.config = evsel->tp_format->id;
|
2012-09-26 23:28:26 +08:00
|
|
|
attr.sample_period = 1;
|
2019-07-21 19:23:53 +08:00
|
|
|
evsel__init(evsel, &attr, idx);
|
2012-09-18 22:21:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return evsel;
|
|
|
|
|
|
|
|
out_free:
|
2013-12-28 03:55:14 +08:00
|
|
|
zfree(&evsel->name);
|
2012-09-18 22:21:50 +08:00
|
|
|
free(evsel);
|
2015-09-07 16:38:06 +08:00
|
|
|
out_err:
|
|
|
|
return ERR_PTR(err);
|
2012-09-18 22:21:50 +08:00
|
|
|
}
|
|
|
|
|
2012-09-07 00:11:18 +08:00
|
|
|
/* Printable names of the generic hardware events, indexed by attr.config. */
const char *perf_evsel__hw_names[PERF_COUNT_HW_MAX] = {
	[PERF_COUNT_HW_CPU_CYCLES]		= "cycles",
	[PERF_COUNT_HW_INSTRUCTIONS]		= "instructions",
	[PERF_COUNT_HW_CACHE_REFERENCES]	= "cache-references",
	[PERF_COUNT_HW_CACHE_MISSES]		= "cache-misses",
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= "branches",
	[PERF_COUNT_HW_BRANCH_MISSES]		= "branch-misses",
	[PERF_COUNT_HW_BUS_CYCLES]		= "bus-cycles",
	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= "stalled-cycles-frontend",
	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= "stalled-cycles-backend",
	[PERF_COUNT_HW_REF_CPU_CYCLES]		= "ref-cycles",
};
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
/*
 * Map a PERF_TYPE_HARDWARE config value to its printable name.
 *
 * Returns the perf_evsel__hw_names[] entry, or "unknown-hardware" when
 * config is out of range or the table slot is empty.
 */
static const char *__evsel__hw_name(u64 config)
{
	const char *name = NULL;

	if (config < PERF_COUNT_HW_MAX)
		name = perf_evsel__hw_names[config];

	return name ? name : "unknown-hardware";
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static int perf_evsel__add_modifiers(struct evsel *evsel, char *bf, size_t size)
|
2012-05-26 03:38:11 +08:00
|
|
|
{
|
2012-06-12 00:33:09 +08:00
|
|
|
int colon = 0, r = 0;
|
2019-07-21 19:24:29 +08:00
|
|
|
struct perf_event_attr *attr = &evsel->core.attr;
|
2012-05-26 03:38:11 +08:00
|
|
|
bool exclude_guest_default = false;
|
|
|
|
|
|
|
|
#define MOD_PRINT(context, mod) do { \
|
|
|
|
if (!attr->exclude_##context) { \
|
2012-06-12 00:33:09 +08:00
|
|
|
if (!colon) colon = ++r; \
|
2012-05-26 03:38:11 +08:00
|
|
|
r += scnprintf(bf + r, size - r, "%c", mod); \
|
|
|
|
} } while(0)
|
|
|
|
|
|
|
|
if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv) {
|
|
|
|
MOD_PRINT(kernel, 'k');
|
|
|
|
MOD_PRINT(user, 'u');
|
|
|
|
MOD_PRINT(hv, 'h');
|
|
|
|
exclude_guest_default = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attr->precise_ip) {
|
|
|
|
if (!colon)
|
2012-06-12 00:33:09 +08:00
|
|
|
colon = ++r;
|
2012-05-26 03:38:11 +08:00
|
|
|
r += scnprintf(bf + r, size - r, "%.*s", attr->precise_ip, "ppp");
|
|
|
|
exclude_guest_default = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attr->exclude_host || attr->exclude_guest == exclude_guest_default) {
|
|
|
|
MOD_PRINT(host, 'H');
|
|
|
|
MOD_PRINT(guest, 'G');
|
|
|
|
}
|
|
|
|
#undef MOD_PRINT
|
|
|
|
if (colon)
|
2012-06-12 00:33:09 +08:00
|
|
|
bf[colon - 1] = ':';
|
2012-05-26 03:38:11 +08:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
static int evsel__hw_name(struct evsel *evsel, char *bf, size_t size)
|
2012-06-12 00:33:09 +08:00
|
|
|
{
|
2020-04-30 03:07:09 +08:00
|
|
|
int r = scnprintf(bf, size, "%s", __evsel__hw_name(evsel->core.attr.config));
|
2012-06-12 00:33:09 +08:00
|
|
|
return r + perf_evsel__add_modifiers(evsel, bf + r, size - r);
|
|
|
|
}
|
|
|
|
|
2012-09-07 00:11:18 +08:00
|
|
|
/* Printable names of the software events, indexed by attr.config. */
const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
	[PERF_COUNT_SW_CPU_CLOCK]	 = "cpu-clock",
	[PERF_COUNT_SW_TASK_CLOCK]	 = "task-clock",
	[PERF_COUNT_SW_PAGE_FAULTS]	 = "page-faults",
	[PERF_COUNT_SW_CONTEXT_SWITCHES] = "context-switches",
	[PERF_COUNT_SW_CPU_MIGRATIONS]	 = "cpu-migrations",
	[PERF_COUNT_SW_PAGE_FAULTS_MIN]	 = "minor-faults",
	[PERF_COUNT_SW_PAGE_FAULTS_MAJ]	 = "major-faults",
	[PERF_COUNT_SW_ALIGNMENT_FAULTS] = "alignment-faults",
	[PERF_COUNT_SW_EMULATION_FAULTS] = "emulation-faults",
	[PERF_COUNT_SW_DUMMY]		 = "dummy",
};
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
/*
 * Map a PERF_TYPE_SOFTWARE config value to its printable name.
 *
 * Returns the perf_evsel__sw_names[] entry, or "unknown-software" when
 * config is out of range or the table slot is empty.
 */
static const char *__evsel__sw_name(u64 config)
{
	const char *name = NULL;

	if (config < PERF_COUNT_SW_MAX)
		name = perf_evsel__sw_names[config];

	return name ? name : "unknown-software";
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
static int evsel__sw_name(struct evsel *evsel, char *bf, size_t size)
|
2012-06-12 01:36:20 +08:00
|
|
|
{
|
2020-04-30 03:07:09 +08:00
|
|
|
int r = scnprintf(bf, size, "%s", __evsel__sw_name(evsel->core.attr.config));
|
2012-06-12 01:36:20 +08:00
|
|
|
return r + perf_evsel__add_modifiers(evsel, bf + r, size - r);
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
/*
 * Format a breakpoint event as "mem:0x<addr>:" followed by one letter for
 * each access bit set in type: 'r' (HW_BREAKPOINT_R), 'w' (HW_BREAKPOINT_W)
 * and 'x' (HW_BREAKPOINT_X), in that order.
 *
 * Returns the number of characters written.
 */
static int __evsel__bp_name(char *bf, size_t size, u64 addr, u64 type)
{
	int len = scnprintf(bf, size, "mem:0x%" PRIx64 ":", addr);

	if (type & HW_BREAKPOINT_R)
		len += scnprintf(bf + len, size - len, "r");
	if (type & HW_BREAKPOINT_W)
		len += scnprintf(bf + len, size - len, "w");
	if (type & HW_BREAKPOINT_X)
		len += scnprintf(bf + len, size - len, "x");

	return len;
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
static int evsel__bp_name(struct evsel *evsel, char *bf, size_t size)
|
2012-06-29 05:18:49 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
struct perf_event_attr *attr = &evsel->core.attr;
|
2020-04-30 03:07:09 +08:00
|
|
|
int r = __evsel__bp_name(bf, size, attr->bp_addr, attr->bp_type);
|
2012-06-29 05:18:49 +08:00
|
|
|
return r + perf_evsel__add_modifiers(evsel, bf + r, size - r);
|
|
|
|
}
|
|
|
|
|
2012-06-12 01:08:07 +08:00
|
|
|
const char *perf_evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_EVSEL__MAX_ALIASES] = {
|
|
|
|
{ "L1-dcache", "l1-d", "l1d", "L1-data", },
|
|
|
|
{ "L1-icache", "l1-i", "l1i", "L1-instruction", },
|
|
|
|
{ "LLC", "L2", },
|
|
|
|
{ "dTLB", "d-tlb", "Data-TLB", },
|
|
|
|
{ "iTLB", "i-tlb", "Instruction-TLB", },
|
|
|
|
{ "branch", "branches", "bpu", "btb", "bpc", },
|
|
|
|
{ "node", },
|
|
|
|
};
|
|
|
|
|
|
|
|
const char *perf_evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_EVSEL__MAX_ALIASES] = {
|
|
|
|
{ "load", "loads", "read", },
|
|
|
|
{ "store", "stores", "write", },
|
|
|
|
{ "prefetch", "prefetches", "speculative-read", "speculative-load", },
|
|
|
|
};
|
|
|
|
|
|
|
|
const char *perf_evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX]
|
|
|
|
[PERF_EVSEL__MAX_ALIASES] = {
|
|
|
|
{ "refs", "Reference", "ops", "access", },
|
|
|
|
{ "misses", "miss", },
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Shorthand for the PERF_COUNT_HW_CACHE_* enumerators. */
#define C(x)		PERF_COUNT_HW_CACHE_##x
/* One bit per cache operation. */
#define CACHE_READ	(1 << C(OP_READ))
#define CACHE_WRITE	(1 << C(OP_WRITE))
#define CACHE_PREFETCH	(1 << C(OP_PREFETCH))
/* Argument is parenthesized so low-precedence expressions expand correctly. */
#define COP(x)		(1 << (x))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cache operartion stat
|
|
|
|
* L1I : Read and prefetch only
|
|
|
|
* ITLB and BPU : Read-only
|
|
|
|
*/
|
|
|
|
static unsigned long perf_evsel__hw_cache_stat[C(MAX)] = {
|
|
|
|
[C(L1D)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
|
|
|
|
[C(L1I)] = (CACHE_READ | CACHE_PREFETCH),
|
|
|
|
[C(LL)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
|
|
|
|
[C(DTLB)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
|
|
|
|
[C(ITLB)] = (CACHE_READ),
|
|
|
|
[C(BPU)] = (CACHE_READ),
|
|
|
|
[C(NODE)] = (CACHE_READ | CACHE_WRITE | CACHE_PREFETCH),
|
|
|
|
};
|
|
|
|
|
2020-04-30 21:51:16 +08:00
|
|
|
/*
 * Tell whether cache operation @op is accepted for cache type @type
 * according to perf_evsel__hw_cache_stat[].
 *
 * Both arguments are u8 and may hold up to 255, so they are range checked
 * first: @type indexes a table of C(MAX) entries and @op is used as a shift
 * count. Out of range values are reported as invalid.
 *
 * Returns true when the combination is valid, false otherwise.
 */
bool evsel__is_cache_op_valid(u8 type, u8 op)
{
	if (type >= C(MAX) || op >= C(OP_MAX))
		return false; /* invalid */

	return (perf_evsel__hw_cache_stat[type] & COP(op)) != 0;
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
/*
 * Format a hardware cache event name into bf.
 *
 * With a non-zero @result the name is "<cache>-<op>-<result>", built from
 * the first alias of each table row. With @result == 0 it is "<cache>-<op>"
 * using the second alias of the operation row.
 *
 * The indexes are used as is; no range check is done here.
 *
 * Returns the number of characters written.
 */
int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size)
{
	const char *cache = perf_evsel__hw_cache[type][0];

	if (!result)
		return scnprintf(bf, size, "%s-%s", cache,
				 perf_evsel__hw_cache_op[op][1]);

	return scnprintf(bf, size, "%s-%s-%s", cache,
			 perf_evsel__hw_cache_op[op][0],
			 perf_evsel__hw_cache_result[result][0]);
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
/*
 * Decode a PERF_TYPE_HW_CACHE config value and format its name into bf.
 *
 * config carries the cache type in bits 0-7, the operation in bits 8-15 and
 * the result in bits 16-23. Each field is range checked in that order, then
 * the type/op combination is validated; the first failing check decides the
 * error string that is written instead of the name.
 *
 * Returns the number of characters written.
 */
static int __evsel__hw_cache_name(u64 config, char *bf, size_t size)
{
	const u8 type	= (config >>  0) & 0xff;
	const u8 op	= (config >>  8) & 0xff;
	const u8 result	= (config >> 16) & 0xff;
	const char *err;

	if (type >= PERF_COUNT_HW_CACHE_MAX)
		err = "unknown-ext-hardware-cache-type";
	else if (op >= PERF_COUNT_HW_CACHE_OP_MAX)
		err = "unknown-ext-hardware-cache-op";
	else if (result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		err = "unknown-ext-hardware-cache-result";
	else if (!evsel__is_cache_op_valid(type, op))
		err = "invalid-cache";
	else
		return __evsel__hw_cache_type_op_res_name(type, op, result, bf, size);

	return scnprintf(bf, size, "%s", err);
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
static int evsel__hw_cache_name(struct evsel *evsel, char *bf, size_t size)
|
2012-06-12 01:08:07 +08:00
|
|
|
{
|
2020-04-30 03:07:09 +08:00
|
|
|
int ret = __evsel__hw_cache_name(evsel->core.attr.config, bf, size);
|
2012-06-12 01:08:07 +08:00
|
|
|
return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret);
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
static int evsel__raw_name(struct evsel *evsel, char *bf, size_t size)
|
2012-06-13 22:53:37 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
int ret = scnprintf(bf, size, "raw 0x%" PRIx64, evsel->core.attr.config);
|
2012-06-13 22:53:37 +08:00
|
|
|
return ret + perf_evsel__add_modifiers(evsel, bf + ret, size - ret);
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
/*
 * Format the name of a tool event into bf; this always writes
 * "duration_time".
 *
 * Returns the number of characters written.
 */
static int evsel__tool_name(char *bf, size_t size)
{
	return scnprintf(bf, size, "duration_time");
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
const char *evsel__name(struct evsel *evsel)
|
2012-06-12 21:29:12 +08:00
|
|
|
{
|
2012-06-12 23:34:58 +08:00
|
|
|
char bf[128];
|
2012-06-12 21:29:12 +08:00
|
|
|
|
2019-06-18 01:32:53 +08:00
|
|
|
if (!evsel)
|
|
|
|
goto out_unknown;
|
|
|
|
|
2012-06-12 23:34:58 +08:00
|
|
|
if (evsel->name)
|
|
|
|
return evsel->name;
|
2012-05-26 03:38:11 +08:00
|
|
|
|
2019-07-21 19:24:29 +08:00
|
|
|
switch (evsel->core.attr.type) {
|
2012-05-26 03:38:11 +08:00
|
|
|
case PERF_TYPE_RAW:
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__raw_name(evsel, bf, sizeof(bf));
|
2012-05-26 03:38:11 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
case PERF_TYPE_HARDWARE:
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__hw_name(evsel, bf, sizeof(bf));
|
2012-05-26 03:38:11 +08:00
|
|
|
break;
|
2012-06-12 01:08:07 +08:00
|
|
|
|
|
|
|
case PERF_TYPE_HW_CACHE:
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__hw_cache_name(evsel, bf, sizeof(bf));
|
2012-06-12 01:08:07 +08:00
|
|
|
break;
|
|
|
|
|
2012-06-12 01:36:20 +08:00
|
|
|
case PERF_TYPE_SOFTWARE:
|
2019-03-27 06:18:22 +08:00
|
|
|
if (evsel->tool_event)
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__tool_name(bf, sizeof(bf));
|
2019-03-27 06:18:22 +08:00
|
|
|
else
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__sw_name(evsel, bf, sizeof(bf));
|
2012-06-12 01:36:20 +08:00
|
|
|
break;
|
|
|
|
|
2012-06-12 21:29:12 +08:00
|
|
|
case PERF_TYPE_TRACEPOINT:
|
2012-06-12 23:34:58 +08:00
|
|
|
scnprintf(bf, sizeof(bf), "%s", "unknown tracepoint");
|
2012-06-12 21:29:12 +08:00
|
|
|
break;
|
|
|
|
|
2012-06-29 05:18:49 +08:00
|
|
|
case PERF_TYPE_BREAKPOINT:
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__bp_name(evsel, bf, sizeof(bf));
|
2012-06-29 05:18:49 +08:00
|
|
|
break;
|
|
|
|
|
2012-05-26 03:38:11 +08:00
|
|
|
default:
|
2012-08-17 03:10:18 +08:00
|
|
|
scnprintf(bf, sizeof(bf), "unknown attr type: %d",
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.type);
|
2012-06-12 21:29:12 +08:00
|
|
|
break;
|
2012-05-26 03:38:11 +08:00
|
|
|
}
|
|
|
|
|
2012-06-12 23:34:58 +08:00
|
|
|
evsel->name = strdup(bf);
|
|
|
|
|
2019-06-18 01:32:53 +08:00
|
|
|
if (evsel->name)
|
|
|
|
return evsel->name;
|
|
|
|
out_unknown:
|
|
|
|
return "unknown";
|
2012-05-26 03:38:11 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
const char *evsel__group_name(struct evsel *evsel)
|
2013-01-22 17:09:44 +08:00
|
|
|
{
|
|
|
|
return evsel->group_name ?: "anon group";
|
|
|
|
}
|
|
|
|
|
2018-03-07 23:50:02 +08:00
|
|
|
/*
|
|
|
|
* Returns the group details for the specified leader,
|
|
|
|
* with following rules.
|
|
|
|
*
|
|
|
|
* For record -e '{cycles,instructions}'
|
|
|
|
* 'anon group { cycles:u, instructions:u }'
|
|
|
|
*
|
|
|
|
* For record -e 'cycles,instructions' and report --group
|
|
|
|
* 'cycles:u, instructions:u'
|
|
|
|
*/
|
2020-04-30 03:09:12 +08:00
|
|
|
int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
|
2013-01-22 17:09:44 +08:00
|
|
|
{
|
2018-03-07 23:50:02 +08:00
|
|
|
int ret = 0;
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *pos;
|
2020-04-30 03:07:09 +08:00
|
|
|
const char *group_name = evsel__group_name(evsel);
|
2013-01-22 17:09:44 +08:00
|
|
|
|
2018-03-07 23:50:02 +08:00
|
|
|
if (!evsel->forced_leader)
|
|
|
|
ret = scnprintf(buf, size, "%s { ", group_name);
|
2013-01-22 17:09:44 +08:00
|
|
|
|
2020-04-30 03:07:09 +08:00
|
|
|
ret += scnprintf(buf + ret, size - ret, "%s", evsel__name(evsel));
|
2013-01-22 17:09:44 +08:00
|
|
|
|
|
|
|
for_each_group_member(pos, evsel)
|
2020-04-30 03:07:09 +08:00
|
|
|
ret += scnprintf(buf + ret, size - ret, ", %s", evsel__name(pos));
|
2013-01-22 17:09:44 +08:00
|
|
|
|
2018-03-07 23:50:02 +08:00
|
|
|
if (!evsel->forced_leader)
|
|
|
|
ret += scnprintf(buf + ret, size - ret, " }");
|
2013-01-22 17:09:44 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-04-30 02:57:01 +08:00
|
|
|
static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
|
|
|
|
struct callchain_param *param)
|
2014-03-02 23:56:40 +08:00
|
|
|
{
|
2020-04-30 21:51:16 +08:00
|
|
|
bool function = evsel__is_function_event(evsel);
|
2019-07-21 19:24:29 +08:00
|
|
|
struct perf_event_attr *attr = &evsel->core.attr;
|
2014-03-02 23:56:40 +08:00
|
|
|
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, CALLCHAIN);
|
2014-03-02 23:56:40 +08:00
|
|
|
|
perf tools: Per event max-stack settings
The tooling counterpart, now it is possible to do:
# perf record -e sched:sched_switch/max-stack=10/ -e cycles/call-graph=dwarf,max-stack=4/ -e cpu-cycles/call-graph=dwarf,max-stack=1024/ usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.052 MB perf.data (5 samples) ]
# perf evlist -v
sched:sched_switch: type: 2, size: 112, config: 0x110, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|RAW|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, sample_max_stack: 10
cycles/call-graph=dwarf,max-stack=4/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 4
cpu-cycles/call-graph=dwarf,max-stack=1024/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 1024
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Using just /max-stack=N/ means /call-graph=fp,max-stack=N/, that should
be further configurable by means of some .perfconfig knob.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-04-29 06:03:42 +08:00
|
|
|
attr->sample_max_stack = param->max_stack;
|
|
|
|
|
2019-05-30 21:29:22 +08:00
|
|
|
if (opts->kernel_callchains)
|
|
|
|
attr->exclude_callchain_user = 1;
|
|
|
|
if (opts->user_callchains)
|
|
|
|
attr->exclude_callchain_kernel = 1;
|
2015-08-04 16:30:20 +08:00
|
|
|
if (param->record_mode == CALLCHAIN_LBR) {
|
2015-01-06 02:23:04 +08:00
|
|
|
if (!opts->branch_stack) {
|
|
|
|
if (attr->exclude_user) {
|
|
|
|
pr_warning("LBR callstack option is only available "
|
|
|
|
"to get user callchain information. "
|
|
|
|
"Falling back to framepointers.\n");
|
|
|
|
} else {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, BRANCH_STACK);
|
2015-01-06 02:23:04 +08:00
|
|
|
attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
|
2015-12-12 08:12:24 +08:00
|
|
|
PERF_SAMPLE_BRANCH_CALL_STACK |
|
|
|
|
PERF_SAMPLE_BRANCH_NO_CYCLES |
|
perf evsel: Support PERF_SAMPLE_BRANCH_HW_INDEX
A new branch sample type PERF_SAMPLE_BRANCH_HW_INDEX has been introduced
in latest kernel.
Enable HW_INDEX by default in LBR call stack mode.
If kernel doesn't support the sample type, switching it off.
Add HW_INDEX in attr_fprintf as well. User can check whether the branch
sample type is set via debug information or header.
Committer testing:
First collect some samples with LBR callchains, system wide, for a few
seconds:
# perf record --call-graph lbr -a sleep 5
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.625 MB perf.data (224 samples) ]
#
Now lets use 'perf evlist -v' to look at the branch_sample_type:
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES|HW_INDEX
#
So the machine has the kernel feature, and it was correctly added to
perf_event_attr.branch_sample_type, for the default 'cycles' event.
If we do it in another machine, where the kernel lacks the HW_INDEX
feature, we get:
# perf record --call-graph lbr -a sleep 2s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.690 MB perf.data (499 samples) ]
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES
#
No HW_INDEX in attr.branch_sample_type.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Pavel Gerasimov <pavel.gerasimov@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vitaly Slobodskoy <vitaly.slobodskoy@intel.com>
Link: http://lore.kernel.org/lkml/20200228163011.19358-3-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-02-29 00:30:01 +08:00
|
|
|
PERF_SAMPLE_BRANCH_NO_FLAGS |
|
|
|
|
PERF_SAMPLE_BRANCH_HW_INDEX;
|
2015-01-06 02:23:04 +08:00
|
|
|
}
|
|
|
|
} else
|
|
|
|
pr_warning("Cannot use LBR callstack with branch stack. "
|
|
|
|
"Falling back to framepointers.\n");
|
|
|
|
}
|
|
|
|
|
2015-08-04 16:30:20 +08:00
|
|
|
if (param->record_mode == CALLCHAIN_DWARF) {
|
2014-03-02 23:56:40 +08:00
|
|
|
if (!function) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, REGS_USER);
|
|
|
|
evsel__set_sample_bit(evsel, STACK_USER);
|
perf record: Allow mixing --user-regs with --call-graph=dwarf
When DWARF stacks were requested and at the same time that the user
specifies a register set using the --user-regs option the full register
context was being captured on samples:
$ perf record -g --call-graph dwarf,1024 --user-regs=IP,SP,BP -- stack_test2.g.O3
188143843893585 0x6b48 [0x4f8]: PERF_RECORD_SAMPLE(IP, 0x4002): 23828/23828: 0x401236 period: 1363819 addr: 0x7ffedbdd51ac
... FP chain: nr:0
... user regs: mask 0xff0fff ABI 64-bit
.... AX 0x53b
.... BX 0x7ffedbdd3cc0
.... CX 0xffffffff
.... DX 0x33d3a
.... SI 0x7f09b74c38d0
.... DI 0x0
.... BP 0x401260
.... SP 0x7ffedbdd3cc0
.... IP 0x401236
.... FLAGS 0x20a
.... CS 0x33
.... SS 0x2b
.... R8 0x7f09b74c3800
.... R9 0x7f09b74c2da0
.... R10 0xfffffffffffff3ce
.... R11 0x246
.... R12 0x401070
.... R13 0x7ffedbdd5db0
.... R14 0x0
.... R15 0x0
... ustack: size 1024, offset 0xe0
. data_src: 0x5080021
... thread: stack_test2.g.O:23828
...... dso: /root/abudanko/stacks/stack_test2.g.O3
I.e. the --user-regs=IP,SP,BP was being ignored, being overridden by the
needs of --call-graph=dwarf.
After applying the change in this patch the sample data contains the
user specified register, but making sure that at least the minimal set
of register needed for DWARF unwinding (DWARF_MINIMAL_REGS) is
requested.
The user is warned that DWARF unwinding may not work if extra registers
end up being needed.
-g call-graph dwarf,K full_regs
--user-regs=user_regs user_regs
-g call-graph dwarf,K --user-regs=user_regs user_regs + DWARF_MINIMAL_REGS
$ perf record -g --call-graph dwarf,1024 --user-regs=BP -- ls
WARNING: The use of --call-graph=dwarf may require all the user registers, specifying a subset with --user-regs may render DWARF unwinding unreliable, so the minimal registers set (IP, SP) is explicitly forced.
arch COPYING Documentation include Kbuild lbuild MAINTAINERS modules.builtin Module.symvers perf.data.old scripts System.map virt
block CREDITS drivers init Kconfig lib Makefile modules.builtin.modinfo net README security tools vmlinux
certs crypto fs ipc kernel LICENSES mm modules.order perf.data samples sound usr vmlinux.o
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.030 MB perf.data (10 samples) ]
188368474305373 0x5e40 [0x470]: PERF_RECORD_SAMPLE(IP, 0x4002): 23839/23839: 0x401236 period: 1260507 addr: 0x7ffd3d85e96c
... FP chain: nr:0
... user regs: mask 0x1c0 ABI 64-bit
.... BP 0x401260
.... SP 0x7ffd3d85cc20
.... IP 0x401236
... ustack: size 1024, offset 0x58
. data_src: 0x5080021
Committer notes:
Detected build failures on arches where PERF_REGS_ is not available,
such as debian:experimental-x-{mips,mips64,mipsel}, fedora 24 and 30 for
ARC uClibc and glibc, reported to Alexey that provided a patch moving
the DWARF_MINIMAL_REGS from evsel.c to util/perf_regs.h, where it is
guarded by an HAVE_PERF_REGS_SUPPORT ifdef.
Committer testing:
# perf record --user-regs=bp,ax -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.955 MB perf.data (1773 samples) ]
# perf script -F+uregs | grep AX: | head -5
perf 1719 [000] 181.272398: 1 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffef828fb00
perf 1719 [000] 181.272402: 1 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffef828fb00
perf 1719 [000] 181.272403: 8 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffef828fb00
perf 1719 [000] 181.272405: 181 cycles: ffffffffba06a7c6 native_write_msr+0x6 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffef828fb00
perf 1719 [000] 181.272406: 4405 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffef828fb00
# perf record --call-graph=dwarf --user-regs=bp,ax -a sleep 1
WARNING: The use of --call-graph=dwarf may require all the user registers, specifying a subset with --user-regs may render DWARF unwinding unreliable, so the minimal registers set (IP, SP) is explicitly forced.
[ perf record: Woken up 55 times to write data ]
[ perf record: Captured and wrote 24.184 MB perf.data (2841 samples) ]
[root@quaco ~]# perf script --hide-call-graph -F+uregs | grep AX: | head -5
perf 1729 [000] 211.268006: 1 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffc8679abb0 SP:0x7ffc8679ab78 IP:0x7fa75223a0db
perf 1729 [000] 211.268014: 1 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffc8679abb0 SP:0x7ffc8679ab78 IP:0x7fa75223a0db
perf 1729 [000] 211.268017: 5 cycles: ffffffffba06a7c4 native_write_msr+0x4 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffc8679abb0 SP:0x7ffc8679ab78 IP:0x7fa75223a0db
perf 1729 [000] 211.268020: 48 cycles: ffffffffba06a7c6 native_write_msr+0x6 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffc8679abb0 SP:0x7ffc8679ab78 IP:0x7fa75223a0db
perf 1729 [000] 211.268024: 490 cycles: ffffffffba00e471 intel_bts_enable_local+0x21 (/lib/modules/5.2.0-rc1+/build/vmlinux) ABI:2 AX:0xffffffffffffffda BP:0x7ffc8679abb0 SP:0x7ffc8679ab78 IP:0x7fa75223a0db
#
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/e7fd37b1-af22-0d94-a0dc-5895e803bbfe@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-05-31 03:03:36 +08:00
|
|
|
if (opts->sample_user_regs && DWARF_MINIMAL_REGS != PERF_REGS_MASK) {
|
|
|
|
attr->sample_regs_user |= DWARF_MINIMAL_REGS;
|
|
|
|
pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
|
|
|
|
"specifying a subset with --user-regs may render DWARF unwinding unreliable, "
|
|
|
|
"so the minimal registers set (IP, SP) is explicitly forced.\n");
|
|
|
|
} else {
|
|
|
|
attr->sample_regs_user |= PERF_REGS_MASK;
|
|
|
|
}
|
2015-08-04 16:30:20 +08:00
|
|
|
attr->sample_stack_user = param->dump_size;
|
2014-03-02 23:56:40 +08:00
|
|
|
attr->exclude_callchain_user = 1;
|
|
|
|
} else {
|
|
|
|
pr_info("Cannot use DWARF unwind for function trace event,"
|
|
|
|
" falling back to framepointers.\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (function) {
|
|
|
|
pr_info("Disabling user space callchains for function trace event.\n");
|
|
|
|
attr->exclude_callchain_user = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-30 02:57:01 +08:00
|
|
|
void evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
|
|
|
|
struct callchain_param *param)
|
2018-01-13 03:21:04 +08:00
|
|
|
{
|
|
|
|
if (param->enabled)
|
2020-04-30 02:57:01 +08:00
|
|
|
return __evsel__config_callchain(evsel, opts, param);
|
2018-01-13 03:21:04 +08:00
|
|
|
}
|
|
|
|
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
static void
|
2019-07-21 19:23:51 +08:00
|
|
|
perf_evsel__reset_callgraph(struct evsel *evsel,
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in term can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
struct callchain_param *param)
|
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
struct perf_event_attr *attr = &evsel->core.attr;
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in term can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, CALLCHAIN);
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in term can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
if (param->record_mode == CALLCHAIN_LBR) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, BRANCH_STACK);
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in term can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER |
|
perf evsel: Support PERF_SAMPLE_BRANCH_HW_INDEX
A new branch sample type PERF_SAMPLE_BRANCH_HW_INDEX has been introduced
in latest kernel.
Enable HW_INDEX by default in LBR call stack mode.
If kernel doesn't support the sample type, switching it off.
Add HW_INDEX in attr_fprintf as well. User can check whether the branch
sample type is set via debug information or header.
Committer testing:
First collect some samples with LBR callchains, system wide, for a few
seconds:
# perf record --call-graph lbr -a sleep 5
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.625 MB perf.data (224 samples) ]
#
Now lets use 'perf evlist -v' to look at the branch_sample_type:
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES|HW_INDEX
#
So the machine has the kernel feature, and it was correctly added to
perf_event_attr.branch_sample_type, for the default 'cycles' event.
If we do it in another machine, where the kernel lacks the HW_INDEX
feature, we get:
# perf record --call-graph lbr -a sleep 2s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.690 MB perf.data (499 samples) ]
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES
#
No HW_INDEX in attr.branch_sample_type.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Pavel Gerasimov <pavel.gerasimov@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vitaly Slobodskoy <vitaly.slobodskoy@intel.com>
Link: http://lore.kernel.org/lkml/20200228163011.19358-3-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-02-29 00:30:01 +08:00
|
|
|
PERF_SAMPLE_BRANCH_CALL_STACK |
|
|
|
|
PERF_SAMPLE_BRANCH_HW_INDEX);
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in term can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
}
|
|
|
|
if (param->record_mode == CALLCHAIN_DWARF) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, REGS_USER);
|
|
|
|
evsel__reset_sample_bit(evsel, STACK_USER);
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in term can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static void apply_config_terms(struct evsel *evsel,
|
2018-01-16 22:16:25 +08:00
|
|
|
struct record_opts *opts, bool track)
|
2015-07-29 17:42:10 +08:00
|
|
|
{
|
|
|
|
struct perf_evsel_config_term *term;
|
2015-08-04 16:30:19 +08:00
|
|
|
struct list_head *config_terms = &evsel->config_terms;
|
2019-07-21 19:24:29 +08:00
|
|
|
struct perf_event_attr *attr = &evsel->core.attr;
|
perf callchain: Fix attr.sample_max_stack setting
When setting the "dwarf" unwinder for a specific event and not
specifying the max-stack, the attr.sample_max_stack ended up using an
uninitialized callchain_param.max_stack, fix it by using designated
initializers for that callchain_param variable, zeroing all non
explicitly initialized struct members.
Here is what happened:
# perf trace -vv --no-syscalls --max-stack 4 -e probe_libc:inet_pton/call-graph=dwarf/ ping -6 -c 1 ::1
callchain: type DWARF
callchain: stack dump size 8192
perf_event_attr:
type 2
size 112
config 0x730
{ sample_period, sample_freq } 1
sample_type IP|TID|TIME|ADDR|CALLCHAIN|CPU|PERIOD|RAW|REGS_USER|STACK_USER|DATA_SRC
exclude_callchain_user 1
{ wakeup_events, wakeup_watermark } 1
sample_regs_user 0xff0fff
sample_stack_user 8192
sample_max_stack 50656
sys_perf_event_open failed, error -75
Value too large for defined data type
# perf trace -vv --no-syscalls --max-stack 4 -e probe_libc:inet_pton/call-graph=dwarf/ ping -6 -c 1 ::1
callchain: type DWARF
callchain: stack dump size 8192
perf_event_attr:
type 2
size 112
config 0x730
sample_type IP|TID|TIME|ADDR|CALLCHAIN|CPU|PERIOD|RAW|REGS_USER|STACK_USER|DATA_SRC
exclude_callchain_user 1
sample_regs_user 0xff0fff
sample_stack_user 8192
sample_max_stack 30448
sys_perf_event_open failed, error -75
Value too large for defined data type
#
Now the attr.sample_max_stack is set to zero and the above works as
expected:
# perf trace --no-syscalls --max-stack 4 -e probe_libc:inet_pton/call-graph=dwarf/ ping -6 -c 1 ::1
PING ::1(::1) 56 data bytes
64 bytes from ::1: icmp_seq=1 ttl=64 time=0.072 ms
--- ::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.072/0.072/0.072/0.000 ms
0.000 probe_libc:inet_pton:(7feb7a998350))
__inet_pton (inlined)
gaih_inet.constprop.7 (/usr/lib64/libc-2.26.so)
__GI_getaddrinfo (inlined)
[0xffffaa39b6108f3f] (/usr/bin/ping)
#
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Hendrick Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-is9tramondqa9jlxxsgcm9iz@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-01-15 22:07:58 +08:00
|
|
|
/* callgraph default */
|
|
|
|
struct callchain_param param = {
|
|
|
|
.record_mode = callchain_param.record_mode,
|
|
|
|
};
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
u32 dump_size = 0;
|
perf tools: Per event max-stack settings
The tooling counterpart, now it is possible to do:
# perf record -e sched:sched_switch/max-stack=10/ -e cycles/call-graph=dwarf,max-stack=4/ -e cpu-cycles/call-graph=dwarf,max-stack=1024/ usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.052 MB perf.data (5 samples) ]
# perf evlist -v
sched:sched_switch: type: 2, size: 112, config: 0x110, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|RAW|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, sample_max_stack: 10
cycles/call-graph=dwarf,max-stack=4/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 4
cpu-cycles/call-graph=dwarf,max-stack=1024/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 1024
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Using just /max-stack=N/ means /call-graph=fp,max-stack=N/, that should
be further configurable by means of some .perfconfig knob.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-04-29 06:03:42 +08:00
|
|
|
int max_stack = 0;
|
|
|
|
const char *callgraph_buf = NULL;
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
|
2015-07-29 17:42:10 +08:00
|
|
|
list_for_each_entry(term, config_terms, list) {
|
|
|
|
switch (term->type) {
|
2015-07-29 17:42:11 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_PERIOD:
|
2017-10-21 04:27:55 +08:00
|
|
|
if (!(term->weak && opts->user_interval != ULLONG_MAX)) {
|
|
|
|
attr->sample_period = term->val.period;
|
|
|
|
attr->freq = 0;
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, PERIOD);
|
2017-10-21 04:27:55 +08:00
|
|
|
}
|
2015-08-04 16:30:19 +08:00
|
|
|
break;
|
2015-08-09 14:45:23 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_FREQ:
|
2017-10-21 04:27:55 +08:00
|
|
|
if (!(term->weak && opts->user_freq != UINT_MAX)) {
|
|
|
|
attr->sample_freq = term->val.freq;
|
|
|
|
attr->freq = 1;
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, PERIOD);
|
2017-10-21 04:27:55 +08:00
|
|
|
}
|
2015-08-09 14:45:23 +08:00
|
|
|
break;
|
2015-08-04 16:30:19 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_TIME:
|
|
|
|
if (term->val.time)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, TIME);
|
2015-08-04 16:30:19 +08:00
|
|
|
else
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, TIME);
|
2015-08-04 16:30:19 +08:00
|
|
|
break;
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_CALLGRAPH:
|
2020-01-17 13:52:50 +08:00
|
|
|
callgraph_buf = term->val.str;
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
break;
|
2016-10-13 05:02:06 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_BRANCH:
|
2020-01-17 13:52:50 +08:00
|
|
|
if (term->val.str && strcmp(term->val.str, "no")) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, BRANCH_STACK);
|
2020-01-17 13:52:50 +08:00
|
|
|
parse_branch_str(term->val.str,
|
2016-10-13 05:02:06 +08:00
|
|
|
&attr->branch_sample_type);
|
|
|
|
} else
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, BRANCH_STACK);
|
2016-10-13 05:02:06 +08:00
|
|
|
break;
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_STACK_USER:
|
|
|
|
dump_size = term->val.stack_user;
|
|
|
|
break;
|
perf tools: Per event max-stack settings
The tooling counterpart, now it is possible to do:
# perf record -e sched:sched_switch/max-stack=10/ -e cycles/call-graph=dwarf,max-stack=4/ -e cpu-cycles/call-graph=dwarf,max-stack=1024/ usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.052 MB perf.data (5 samples) ]
# perf evlist -v
sched:sched_switch: type: 2, size: 112, config: 0x110, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|RAW|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, sample_max_stack: 10
cycles/call-graph=dwarf,max-stack=4/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 4
cpu-cycles/call-graph=dwarf,max-stack=1024/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 1024
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Using just /max-stack=N/ means /call-graph=fp,max-stack=N/, that should
be further configurable by means of some .perfconfig knob.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-04-29 06:03:42 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_MAX_STACK:
|
|
|
|
max_stack = term->val.max_stack;
|
|
|
|
break;
|
perf evsel: Introduce per event max_events property
This simply adds the field to 'struct perf_evsel' and allows setting
it via the event parser; to test it, let's trace 'perf trace' itself:
First look at where in a function that receives an evsel we can put a probe
to read how evsel->max_events was setup:
# perf probe -x ~/bin/perf -L trace__event_handler
<trace__event_handler@/home/acme/git/perf/tools/perf/builtin-trace.c:0>
0 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
3 {
4 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
5 int callchain_ret = 0;
7 if (sample->callchain) {
8 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
9 if (callchain_ret == 0) {
10 if (callchain_cursor.nr < trace->min_stack)
11 goto out;
12 callchain_ret = 1;
}
}
See what variables we can probe at line 7:
# perf probe -x ~/bin/perf -V trace__event_handler:7
Available variables at trace__event_handler:7
@<trace__event_handler+89>
int callchain_ret
struct perf_evsel* evsel
struct perf_sample* sample
struct thread* thread
struct trace* trace
union perf_event* event
Add a probe at that line asking for evsel->max_events to be collected and named
as "max_events":
# perf probe -x ~/bin/perf trace__event_handler:7 'max_events=evsel->max_events'
Added new event:
probe_perf:trace__event_handler (on trace__event_handler:7 in /home/acme/bin/perf with max_events=evsel->max_events)
You can now use it in all perf tools, such as:
perf record -e probe_perf:trace__event_handler -aR sleep 1
Now use 'perf trace', here aliased to just 'trace' and trace trace, i.e.
the first 'trace' is tracing just that 'probe_perf:trace__event_handler' event,
while the traced trace is tracing all scheduler tracepoints, will stop at two
events (--max-events 2) and will just set evsel->max_events for all the sched
tracepoints to 9, we will see the output of both traces intermixed:
# trace -e *perf:*event_handler trace --max-events 2 -e sched:*/nr=9/
0.000 :0/0 sched:sched_waking:comm=rcu_sched pid=10 prio=120 target_cpu=000
0.009 :0/0 sched:sched_wakeup:comm=rcu_sched pid=10 prio=120 target_cpu=000
0.000 trace/23949 probe_perf:trace__event_handler:(48c34a) max_events=0x9
0.046 trace/23949 probe_perf:trace__event_handler:(48c34a) max_events=0x9
#
Now, if the traced trace sends its output to /dev/null, we'll see just
what the first level trace outputs: that evsel->max_events is indeed
being set to 9:
# trace -e *perf:*event_handler trace -o /dev/null --max-events 2 -e sched:*/nr=9/
0.000 trace/23961 probe_perf:trace__event_handler:(48c34a) max_events=0x9
0.030 trace/23961 probe_perf:trace__event_handler:(48c34a) max_events=0x9
#
Now that we can set evsel->max_events, we can go to the next step, honour that
per-event property in 'perf trace'.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-og00yasj276joem6e14l1eas@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-10-20 02:47:34 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_MAX_EVENTS:
|
|
|
|
evsel->max_events = term->val.max_events;
|
|
|
|
break;
|
perf tools: Enable per-event inherit setting by config terms
This patch allows perf record setting event's attr.inherit bit by
config terms like:
# perf record -e cycles/no-inherit/ ...
# perf record -e cycles/inherit/ ...
So user can control inherit bit for each event separately.
In the following example, a.out fork()s in main and then does some complex
CPU intensive computations in both of its children.
Basic result with and without inherit:
# perf record -e cycles -e instructions ./a.out
[ perf record: Woken up 9 times to write data ]
[ perf record: Captured and wrote 2.205 MB perf.data (47920 samples) ]
# perf report --stdio
# ...
# Samples: 23K of event 'cycles'
# Event count (approx.): 23641752891
...
# Samples: 24K of event 'instructions'
# Event count (approx.): 30428312415
# perf record -i -e cycles -e instructions ./a.out
[ perf record: Woken up 5 times to write data ]
[ perf record: Captured and wrote 1.111 MB perf.data (24019 samples) ]
...
# Samples: 12K of event 'cycles'
# Event count (approx.): 11699501775
...
# Samples: 12K of event 'instructions'
# Event count (approx.): 15058023559
Cancel inherit for one event when globally enabled:
# perf record -e cycles/no-inherit/ -e instructions ./a.out
[ perf record: Woken up 7 times to write data ]
[ perf record: Captured and wrote 1.660 MB perf.data (36004 samples) ]
...
# Samples: 12K of event 'cycles/no-inherit/'
# Event count (approx.): 11895759282
...
# Samples: 24K of event 'instructions'
# Event count (approx.): 30668000441
Enable inherit for one event when globally disabled:
# perf record -i -e cycles/inherit/ -e instructions ./a.out
[ perf record: Woken up 7 times to write data ]
[ perf record: Captured and wrote 1.654 MB perf.data (35868 samples) ]
...
# Samples: 23K of event 'cycles/inherit/'
# Event count (approx.): 23285400229
...
# Samples: 11K of event 'instructions'
# Event count (approx.): 14969050259
Committer note:
One can check if the bit was set, in addition to seeing the result in
the perf.data file size as above by doing one of:
# perf record -e cycles -e instructions -a usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.911 MB perf.data (63 samples) ]
# perf evlist -v
cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
instructions: size: 112, config: 0x1, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
#
So, the inherit bit was set in both, now, if we disable it globally using
--no-inherit:
# perf record --no-inherit -e cycles -e instructions -a usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.910 MB perf.data (56 samples) ]
# perf evlist -v
cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
instructions: size: 112, config: 0x1, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
No inherit bit set, then disabling it and setting just on the cycles event:
# perf record --no-inherit -e cycles/inherit/ -e instructions -a usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.909 MB perf.data (48 samples) ]
# perf evlist -v
cycles/inherit/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
instructions: size: 112, config: 0x1, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
#
We can see it as well by using a more verbose level of debug messages in
the tool that sets up the perf_event_attr, 'perf record' in this case:
[root@zoo ~]# perf record -vv --no-inherit -e cycles/inherit/ -e instructions -a usleep 1
------------------------------------------------------------
perf_event_attr:
size 112
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|ID|CPU|PERIOD
read_format ID
disabled 1
inherit 1
mmap 1
comm 1
freq 1
task 1
sample_id_all 1
exclude_guest 1
mmap2 1
comm_exec 1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8
sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8
sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8
sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8
------------------------------------------------------------
perf_event_attr:
size 112
config 0x1
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|ID|CPU|PERIOD
read_format ID
disabled 1
freq 1
sample_id_all 1
exclude_guest 1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8
<SNIP>
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1446029705-199659-2-git-send-email-wangnan0@huawei.com
[ s/u64/bool/ for the perf_evsel_config_term inherit field - jolsa]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-28 18:55:02 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_INHERIT:
|
|
|
|
/*
|
|
|
|
* attr->inherit should have already been set by
|
2020-04-30 02:57:01 +08:00
|
|
|
* evsel__config. If user explicitly set
|
perf tools: Enable per-event inherit setting by config terms
This patch allows perf record setting event's attr.inherit bit by
config terms like:
# perf record -e cycles/no-inherit/ ...
# perf record -e cycles/inherit/ ...
So user can control inherit bit for each event separately.
In the following example, a.out fork()s in main and then does some complex
CPU intensive computations in both of its children.
Basic result with and without inherit:
# perf record -e cycles -e instructions ./a.out
[ perf record: Woken up 9 times to write data ]
[ perf record: Captured and wrote 2.205 MB perf.data (47920 samples) ]
# perf report --stdio
# ...
# Samples: 23K of event 'cycles'
# Event count (approx.): 23641752891
...
# Samples: 24K of event 'instructions'
# Event count (approx.): 30428312415
# perf record -i -e cycles -e instructions ./a.out
[ perf record: Woken up 5 times to write data ]
[ perf record: Captured and wrote 1.111 MB perf.data (24019 samples) ]
...
# Samples: 12K of event 'cycles'
# Event count (approx.): 11699501775
...
# Samples: 12K of event 'instructions'
# Event count (approx.): 15058023559
Cancel inherit for one event when globally enabled:
# perf record -e cycles/no-inherit/ -e instructions ./a.out
[ perf record: Woken up 7 times to write data ]
[ perf record: Captured and wrote 1.660 MB perf.data (36004 samples) ]
...
# Samples: 12K of event 'cycles/no-inherit/'
# Event count (approx.): 11895759282
...
# Samples: 24K of event 'instructions'
# Event count (approx.): 30668000441
Enable inherit for one event when globally disabled:
# perf record -i -e cycles/inherit/ -e instructions ./a.out
[ perf record: Woken up 7 times to write data ]
[ perf record: Captured and wrote 1.654 MB perf.data (35868 samples) ]
...
# Samples: 23K of event 'cycles/inherit/'
# Event count (approx.): 23285400229
...
# Samples: 11K of event 'instructions'
# Event count (approx.): 14969050259
Committer note:
One can check if the bit was set, in addition to seeing the result in
the perf.data file size as above by doing one of:
# perf record -e cycles -e instructions -a usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.911 MB perf.data (63 samples) ]
# perf evlist -v
cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
instructions: size: 112, config: 0x1, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
#
So, the inherit bit was set in both, now, if we disable it globally using
--no-inherit:
# perf record --no-inherit -e cycles -e instructions -a usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.910 MB perf.data (56 samples) ]
# perf evlist -v
cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
instructions: size: 112, config: 0x1, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
No inherit bit set, then disabling it and setting just on the cycles event:
# perf record --no-inherit -e cycles/inherit/ -e instructions -a usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.909 MB perf.data (48 samples) ]
# perf evlist -v
cycles/inherit/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
instructions: size: 112, config: 0x1, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
#
We can see it as well by using a more verbose level of debug messages in
the tool that sets up the perf_event_attr, 'perf record' in this case:
[root@zoo ~]# perf record -vv --no-inherit -e cycles/inherit/ -e instructions -a usleep 1
------------------------------------------------------------
perf_event_attr:
size 112
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|ID|CPU|PERIOD
read_format ID
disabled 1
inherit 1
mmap 1
comm 1
freq 1
task 1
sample_id_all 1
exclude_guest 1
mmap2 1
comm_exec 1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8
sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8
sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8
sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8
------------------------------------------------------------
perf_event_attr:
size 112
config 0x1
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|ID|CPU|PERIOD
read_format ID
disabled 1
freq 1
sample_id_all 1
exclude_guest 1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8
<SNIP>
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1446029705-199659-2-git-send-email-wangnan0@huawei.com
[ s/u64/bool/ for the perf_evsel_config_term inherit field - jolsa]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-28 18:55:02 +08:00
|
|
|
* inherit using config terms, override global
|
|
|
|
* opt->no_inherit setting.
|
|
|
|
*/
|
|
|
|
attr->inherit = term->val.inherit ? 1 : 0;
|
|
|
|
break;
|
perf tools: Enable overwrite settings
This patch allows following config terms and option:
Globally setting events to overwrite;
# perf record --overwrite ...
Set specific events to be overwrite or no-overwrite.
# perf record --event cycles/overwrite/ ...
# perf record --event cycles/no-overwrite/ ...
Add missing config terms and update the config term array size because
the longest string length has changed.
For overwritable events, it automatically selects attr.write_backward
since perf requires it to be backward for reading.
Test result:
# perf record --overwrite -e syscalls:*enter_nanosleep* usleep 1
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.011 MB perf.data (1 samples) ]
# perf evlist -v
syscalls:sys_enter_nanosleep: type: 2, size: 112, config: 0x134, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|PERIOD|RAW, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, write_backward: 1
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1468485287-33422-14-git-send-email-wangnan0@huawei.com
Signed-off-by: He Kuang <hekuang@huawei.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-07-14 16:34:45 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_OVERWRITE:
|
|
|
|
attr->write_backward = term->val.overwrite ? 1 : 0;
|
|
|
|
break;
|
2017-10-21 04:27:54 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_DRV_CFG:
|
2018-01-11 04:46:51 +08:00
|
|
|
break;
|
2019-04-12 21:59:47 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_PERCORE:
|
|
|
|
break;
|
2019-08-06 16:46:05 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_AUX_OUTPUT:
|
|
|
|
attr->aux_output = term->val.aux_output ? 1 : 0;
|
|
|
|
break;
|
2019-11-15 20:42:17 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE:
|
|
|
|
/* Already applied by auxtrace */
|
|
|
|
break;
|
2019-11-15 20:42:22 +08:00
|
|
|
case PERF_EVSEL__CONFIG_TERM_CFG_CHG:
|
|
|
|
break;
|
2015-07-29 17:42:10 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
|
|
|
|
/* User explicitly set per-event callgraph, clear the old setting and reset. */
|
perf tools: Per event max-stack settings
The tooling counterpart, now it is possible to do:
# perf record -e sched:sched_switch/max-stack=10/ -e cycles/call-graph=dwarf,max-stack=4/ -e cpu-cycles/call-graph=dwarf,max-stack=1024/ usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.052 MB perf.data (5 samples) ]
# perf evlist -v
sched:sched_switch: type: 2, size: 112, config: 0x110, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|RAW|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, sample_max_stack: 10
cycles/call-graph=dwarf,max-stack=4/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 4
cpu-cycles/call-graph=dwarf,max-stack=1024/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 1024
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Using just /max-stack=N/ means /call-graph=fp,max-stack=N/, that should
be further configurable by means of some .perfconfig knob.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-04-29 06:03:42 +08:00
|
|
|
if ((callgraph_buf != NULL) || (dump_size > 0) || max_stack) {
|
2018-01-16 22:16:25 +08:00
|
|
|
bool sample_address = false;
|
|
|
|
|
perf tools: Per event max-stack settings
The tooling counterpart, now it is possible to do:
# perf record -e sched:sched_switch/max-stack=10/ -e cycles/call-graph=dwarf,max-stack=4/ -e cpu-cycles/call-graph=dwarf,max-stack=1024/ usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.052 MB perf.data (5 samples) ]
# perf evlist -v
sched:sched_switch: type: 2, size: 112, config: 0x110, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|RAW|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, sample_max_stack: 10
cycles/call-graph=dwarf,max-stack=4/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 4
cpu-cycles/call-graph=dwarf,max-stack=1024/: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|REGS_USER|STACK_USER|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1, exclude_callchain_user: 1, sample_regs_user: 0xff0fff, sample_stack_user: 8192, sample_max_stack: 1024
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Using just /max-stack=N/ means /call-graph=fp,max-stack=N/, that should
be further configurable by means of some .perfconfig knob.
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-04-29 06:03:42 +08:00
|
|
|
if (max_stack) {
|
|
|
|
param.max_stack = max_stack;
|
|
|
|
if (callgraph_buf == NULL)
|
|
|
|
callgraph_buf = "fp";
|
|
|
|
}
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
|
|
|
|
/* parse callgraph parameters */
|
|
|
|
if (callgraph_buf != NULL) {
|
2015-08-11 18:30:48 +08:00
|
|
|
if (!strcmp(callgraph_buf, "no")) {
|
|
|
|
param.enabled = false;
|
|
|
|
param.record_mode = CALLCHAIN_NONE;
|
|
|
|
} else {
|
|
|
|
param.enabled = true;
|
|
|
|
if (parse_callchain_record(callgraph_buf, ¶m)) {
|
|
|
|
pr_err("per-event callgraph setting for %s failed. "
|
|
|
|
"Apply callgraph global setting for it\n",
|
|
|
|
evsel->name);
|
|
|
|
return;
|
|
|
|
}
|
2018-01-16 22:16:25 +08:00
|
|
|
if (param.record_mode == CALLCHAIN_DWARF)
|
|
|
|
sample_address = true;
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (dump_size > 0) {
|
|
|
|
dump_size = round_up(dump_size, sizeof(u64));
|
|
|
|
param.dump_size = dump_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If global callgraph set, clear it */
|
|
|
|
if (callchain_param.enabled)
|
|
|
|
perf_evsel__reset_callgraph(evsel, &callchain_param);
|
|
|
|
|
|
|
|
/* set perf-event callgraph */
|
2018-01-16 22:16:25 +08:00
|
|
|
if (param.enabled) {
|
|
|
|
if (sample_address) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, ADDR);
|
|
|
|
evsel__set_sample_bit(evsel, DATA_SRC);
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.mmap_data = track;
|
2018-01-16 22:16:25 +08:00
|
|
|
}
|
2020-04-30 02:57:01 +08:00
|
|
|
evsel__config_callchain(evsel, opts, ¶m);
|
2018-01-16 22:16:25 +08:00
|
|
|
}
|
perf callchain: Per-event type selection support
This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per
event. This in turn can reduce sampling overhead and the size of the
perf.data.
Here is an example.
perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1
perf evlist -v
cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112,
config: 0x3c, { sample_period, sample_freq }: 1000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1,
inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all:
1, exclude_guest: 1, mmap2: 1, comm_exec: 1
cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, {
sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID,
disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1,
exclude_guest: 1
Signed-off-by: Kan Liang <kan.liang@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-08-11 18:30:47 +08:00
|
|
|
}
|
2015-07-29 17:42:10 +08:00
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static bool is_dummy_event(struct evsel *evsel)
|
2018-07-09 22:15:22 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
return (evsel->core.attr.type == PERF_TYPE_SOFTWARE) &&
|
|
|
|
(evsel->core.attr.config == PERF_COUNT_SW_DUMMY);
|
2018-07-09 22:15:22 +08:00
|
|
|
}
|
|
|
|
|
2019-11-15 20:42:17 +08:00
|
|
|
struct perf_evsel_config_term *__perf_evsel__get_config_term(struct evsel *evsel,
|
|
|
|
enum evsel_term_type type)
|
|
|
|
{
|
|
|
|
struct perf_evsel_config_term *term, *found_term = NULL;
|
|
|
|
|
|
|
|
list_for_each_entry(term, &evsel->config_terms, list) {
|
|
|
|
if (term->type == type)
|
|
|
|
found_term = term;
|
|
|
|
}
|
|
|
|
|
|
|
|
return found_term;
|
|
|
|
}
|
|
|
|
|
2012-11-13 01:34:01 +08:00
|
|
|
/*
|
|
|
|
* The enable_on_exec/disabled value strategy:
|
|
|
|
*
|
|
|
|
* 1) For any type of traced program:
|
|
|
|
* - all independent events and group leaders are disabled
|
|
|
|
* - all group members are enabled
|
|
|
|
*
|
|
|
|
* Group members are ruled by group leaders. They need to
|
|
|
|
* be enabled, because the group scheduling relies on that.
|
|
|
|
*
|
|
|
|
* 2) For traced programs executed by perf:
|
|
|
|
* - all independent events and group leaders have
|
|
|
|
* enable_on_exec set
|
|
|
|
* - we don't specifically enable or disable any event during
|
|
|
|
* the record command
|
|
|
|
*
|
|
|
|
* Independent events and group leaders are initially disabled
|
|
|
|
* and get enabled by exec. Group members are ruled by group
|
|
|
|
* leaders as stated in 1).
|
|
|
|
*
|
|
|
|
* 3) For traced programs attached by perf (pid/tid):
|
|
|
|
* - we specifically enable or disable all events during
|
|
|
|
* the record command
|
|
|
|
*
|
|
|
|
* When attaching events to already running traced we
|
|
|
|
* enable/disable events specifically, as there's no
|
|
|
|
* initial traced exec call.
|
|
|
|
*/
|
2020-04-30 02:57:01 +08:00
|
|
|
void evsel__config(struct evsel *evsel, struct record_opts *opts,
|
|
|
|
struct callchain_param *callchain)
|
2011-11-09 00:41:57 +08:00
|
|
|
{
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *leader = evsel->leader;
|
2019-07-21 19:24:29 +08:00
|
|
|
struct perf_event_attr *attr = &evsel->core.attr;
|
2014-07-31 14:00:52 +08:00
|
|
|
int track = evsel->tracking;
|
2013-11-15 21:52:29 +08:00
|
|
|
bool per_cpu = opts->target.default_per_cpu && !opts->target.per_thread;
|
2011-11-09 00:41:57 +08:00
|
|
|
|
2012-12-14 00:13:07 +08:00
|
|
|
attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
|
2011-11-09 00:41:57 +08:00
|
|
|
attr->inherit = !opts->no_inherit;
|
perf tools: Enable overwrite settings
This patch allows following config terms and option:
Globally setting events to overwrite;
# perf record --overwrite ...
Set specific events to be overwrite or no-overwrite.
# perf record --event cycles/overwrite/ ...
# perf record --event cycles/no-overwrite/ ...
Add missing config terms and update the config term array size because
the longest string length has changed.
For overwritable events, it automatically selects attr.write_backward
since perf requires it to be backward for reading.
Test result:
# perf record --overwrite -e syscalls:*enter_nanosleep* usleep 1
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.011 MB perf.data (1 samples) ]
# perf evlist -v
syscalls:sys_enter_nanosleep: type: 2, size: 112, config: 0x134, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|PERIOD|RAW, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, write_backward: 1
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1468485287-33422-14-git-send-email-wangnan0@huawei.com
Signed-off-by: He Kuang <hekuang@huawei.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-07-14 16:34:45 +08:00
|
|
|
attr->write_backward = opts->overwrite ? 1 : 0;
|
2011-11-09 00:41:57 +08:00
|
|
|
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, IP);
|
|
|
|
evsel__set_sample_bit(evsel, TID);
|
2011-11-09 00:41:57 +08:00
|
|
|
|
2012-10-10 23:39:03 +08:00
|
|
|
if (evsel->sample_read) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, READ);
|
2012-10-10 23:39:03 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We need ID even in case of single event, because
|
|
|
|
* PERF_SAMPLE_READ process ID specific data.
|
|
|
|
*/
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_id(evsel, false);
|
2012-10-10 23:39:03 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Apply group format only if we belong to group
|
|
|
|
* with more than one members.
|
|
|
|
*/
|
2019-07-21 19:24:46 +08:00
|
|
|
if (leader->core.nr_members > 1) {
|
2012-10-10 23:39:03 +08:00
|
|
|
attr->read_format |= PERF_FORMAT_GROUP;
|
|
|
|
attr->inherit = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-11-09 00:41:57 +08:00
|
|
|
/*
|
2014-06-09 13:43:37 +08:00
|
|
|
* We default some events to have a default interval. But keep
|
2011-11-09 00:41:57 +08:00
|
|
|
* it a weak assumption overridable by the user.
|
|
|
|
*/
|
2014-06-09 13:43:37 +08:00
|
|
|
if (!attr->sample_period || (opts->user_freq != UINT_MAX ||
|
2011-11-09 00:41:57 +08:00
|
|
|
opts->user_interval != ULLONG_MAX)) {
|
|
|
|
if (opts->freq) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, PERIOD);
|
2011-11-09 00:41:57 +08:00
|
|
|
attr->freq = 1;
|
|
|
|
attr->sample_freq = opts->freq;
|
|
|
|
} else {
|
|
|
|
attr->sample_period = opts->default_interval;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (opts->no_samples)
|
|
|
|
attr->sample_freq = 0;
|
|
|
|
|
2017-08-25 00:27:31 +08:00
|
|
|
if (opts->inherit_stat) {
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.read_format |=
|
2017-08-25 00:27:31 +08:00
|
|
|
PERF_FORMAT_TOTAL_TIME_ENABLED |
|
|
|
|
PERF_FORMAT_TOTAL_TIME_RUNNING |
|
|
|
|
PERF_FORMAT_ID;
|
2011-11-09 00:41:57 +08:00
|
|
|
attr->inherit_stat = 1;
|
2017-08-25 00:27:31 +08:00
|
|
|
}
|
2011-11-09 00:41:57 +08:00
|
|
|
|
|
|
|
if (opts->sample_address) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, ADDR);
|
2011-11-09 00:41:57 +08:00
|
|
|
attr->mmap_data = track;
|
|
|
|
}
|
|
|
|
|
2014-11-14 01:21:03 +08:00
|
|
|
/*
|
|
|
|
* We don't allow user space callchains for function trace
|
|
|
|
* event, due to issues with page faults while tracing page
|
|
|
|
* fault handler and its overall trickiness nature.
|
|
|
|
*/
|
2020-04-30 21:51:16 +08:00
|
|
|
if (evsel__is_function_event(evsel))
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.exclude_callchain_user = 1;
|
2014-11-14 01:21:03 +08:00
|
|
|
|
2016-04-12 05:15:29 +08:00
|
|
|
if (callchain && callchain->enabled && !evsel->no_aux_samples)
|
2020-04-30 02:57:01 +08:00
|
|
|
evsel__config_callchain(evsel, opts, callchain);
|
2012-08-07 21:20:47 +08:00
|
|
|
|
2014-09-24 19:48:39 +08:00
|
|
|
if (opts->sample_intr_regs) {
|
perf record: Add ability to name registers to record
This patch modifies the -I/--int-regs option to enablepassing the name
of the registers to sample on interrupt. Registers can be specified by
their symbolic names. For instance on x86, --intr-regs=ax,si.
The motivation is to reduce the size of the perf.data file and the
overhead of sampling by only collecting the registers useful to a
specific analysis. For instance, for value profiling, sampling only the
registers used to passed arguements to functions.
With no parameter, the --intr-regs still records all possible registers
based on the architecture.
To name registers, it is necessary to use the long form of the option,
i.e., --intr-regs:
$ perf record --intr-regs=si,di,r8,r9 .....
To record any possible registers:
$ perf record -I .....
$ perf report --intr-regs ...
To display the register, one can use perf report -D
To list the available registers:
$ perf record --intr-regs=\?
available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10 R11 R12 R13 R14 R15
Signed-off-by: Stephane Eranian <eranian@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1441039273-16260-4-git-send-email-eranian@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-09-01 00:41:12 +08:00
|
|
|
attr->sample_regs_intr = opts->sample_intr_regs;
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, REGS_INTR);
|
2014-09-24 19:48:39 +08:00
|
|
|
}
|
|
|
|
|
2017-09-06 01:00:28 +08:00
|
|
|
if (opts->sample_user_regs) {
|
|
|
|
attr->sample_regs_user |= opts->sample_user_regs;
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, REGS_USER);
|
2017-09-06 01:00:28 +08:00
|
|
|
}
|
|
|
|
|
2016-08-02 02:02:35 +08:00
|
|
|
if (target__has_cpu(&opts->target) || opts->sample_cpu)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, CPU);
|
2011-11-09 00:41:57 +08:00
|
|
|
|
2014-07-31 14:45:04 +08:00
|
|
|
/*
|
2016-02-25 02:02:25 +08:00
|
|
|
* When the user explicitly disabled time don't force it here.
|
2014-07-31 14:45:04 +08:00
|
|
|
*/
|
|
|
|
if (opts->sample_time &&
|
|
|
|
(!perf_missing_features.sample_id_all &&
|
2015-07-06 19:51:01 +08:00
|
|
|
(!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu ||
|
|
|
|
opts->sample_time_set)))
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, TIME);
|
2011-11-09 00:41:57 +08:00
|
|
|
|
2014-07-14 18:02:56 +08:00
|
|
|
if (opts->raw_samples && !evsel->no_aux_samples) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, TIME);
|
|
|
|
evsel__set_sample_bit(evsel, RAW);
|
|
|
|
evsel__set_sample_bit(evsel, CPU);
|
2011-11-09 00:41:57 +08:00
|
|
|
}
|
|
|
|
|
2013-01-24 23:10:37 +08:00
|
|
|
if (opts->sample_address)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, DATA_SRC);
|
2013-01-24 23:10:37 +08:00
|
|
|
|
2017-08-30 01:11:08 +08:00
|
|
|
if (opts->sample_phys_addr)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, PHYS_ADDR);
|
2017-08-30 01:11:08 +08:00
|
|
|
|
2014-01-15 04:52:14 +08:00
|
|
|
if (opts->no_buffering) {
|
2011-11-09 00:41:57 +08:00
|
|
|
attr->watermark = 0;
|
|
|
|
attr->wakeup_events = 1;
|
|
|
|
}
|
2014-07-14 18:02:56 +08:00
|
|
|
if (opts->branch_stack && !evsel->no_aux_samples) {
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, BRANCH_STACK);
|
2012-02-10 06:21:02 +08:00
|
|
|
attr->branch_sample_type = opts->branch_stack;
|
|
|
|
}
|
2011-11-09 00:41:57 +08:00
|
|
|
|
2013-01-24 23:10:29 +08:00
|
|
|
if (opts->sample_weight)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, WEIGHT);
|
2013-01-24 23:10:29 +08:00
|
|
|
|
2015-01-29 16:06:46 +08:00
|
|
|
attr->task = track;
|
2013-08-21 18:10:25 +08:00
|
|
|
attr->mmap = track;
|
2014-05-30 22:49:42 +08:00
|
|
|
attr->mmap2 = track && !perf_missing_features.mmap2;
|
2013-08-21 18:10:25 +08:00
|
|
|
attr->comm = track;
|
2019-01-18 00:15:17 +08:00
|
|
|
attr->ksymbol = track && !perf_missing_features.ksymbol;
|
2019-08-27 06:31:06 +08:00
|
|
|
attr->bpf_event = track && !opts->no_bpf_event && !perf_missing_features.bpf;
|
2011-11-09 00:41:57 +08:00
|
|
|
|
perf tools: Add PERF_RECORD_NAMESPACES to include namespaces related info
Introduce a new option to record PERF_RECORD_NAMESPACES events emitted
by the kernel when fork, clone, setns or unshare are invoked. And update
perf-record documentation with the new option to record namespace
events.
Committer notes:
Combined it with a later patch to allow printing it via 'perf report -D'
and be able to test the feature introduced in this patch. Had to move
here also perf_ns__name(), that was introduced in another later patch.
Also used PRIu64 and PRIx64 to fix the build in some enfironments wrt:
util/event.c:1129:39: error: format '%lx' expects argument of type 'long unsigned int', but argument 6 has type 'long long unsigned int' [-Werror=format=]
ret += fprintf(fp, "%u/%s: %lu/0x%lx%s", idx
^
Testing it:
# perf record --namespaces -a
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.083 MB perf.data (423 samples) ]
#
# perf report -D
<SNIP>
3 2028902078892 0x115140 [0xa0]: PERF_RECORD_NAMESPACES 14783/14783 - nr_namespaces: 7
[0/net: 3/0xf0000081, 1/uts: 3/0xeffffffe, 2/ipc: 3/0xefffffff, 3/pid: 3/0xeffffffc,
4/user: 3/0xeffffffd, 5/mnt: 3/0xf0000000, 6/cgroup: 3/0xeffffffb]
0x1151e0 [0x30]: event: 9
.
. ... raw event: size 48 bytes
. 0000: 09 00 00 00 02 00 30 00 c4 71 82 68 0c 7f 00 00 ......0..q.h....
. 0010: a9 39 00 00 a9 39 00 00 94 28 fe 63 d8 01 00 00 .9...9...(.c....
. 0020: 03 00 00 00 00 00 00 00 ce c4 02 00 00 00 00 00 ................
<SNIP>
NAMESPACES events: 1
<SNIP>
#
Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/148891930386.25309.18412039920746995488.stgit@hbathini.in.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-03-08 04:41:43 +08:00
|
|
|
if (opts->record_namespaces)
|
|
|
|
attr->namespaces = track;
|
|
|
|
|
2020-03-25 20:45:34 +08:00
|
|
|
if (opts->record_cgroup) {
|
|
|
|
attr->cgroup = track && !perf_missing_features.cgroup;
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, CGROUP);
|
2020-03-25 20:45:34 +08:00
|
|
|
}
|
|
|
|
|
2015-07-21 17:44:04 +08:00
|
|
|
if (opts->record_switch_events)
|
|
|
|
attr->context_switch = track;
|
|
|
|
|
2013-09-20 22:40:43 +08:00
|
|
|
if (opts->sample_transaction)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, TRANSACTION);
|
2013-09-20 22:40:43 +08:00
|
|
|
|
2015-02-25 07:13:40 +08:00
|
|
|
if (opts->running_time) {
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.read_format |=
|
2015-02-25 07:13:40 +08:00
|
|
|
PERF_FORMAT_TOTAL_TIME_ENABLED |
|
|
|
|
PERF_FORMAT_TOTAL_TIME_RUNNING;
|
|
|
|
}
|
|
|
|
|
2012-11-13 01:34:01 +08:00
|
|
|
/*
|
|
|
|
* XXX see the function comment above
|
|
|
|
*
|
|
|
|
* Disabling only independent events or group leaders,
|
|
|
|
* keeping group members enabled.
|
|
|
|
*/
|
2020-04-30 21:51:16 +08:00
|
|
|
if (evsel__is_group_leader(evsel))
|
2012-11-13 01:34:01 +08:00
|
|
|
attr->disabled = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setting enable_on_exec for independent events and
|
|
|
|
* group leaders for traced executed by perf.
|
|
|
|
*/
|
2020-04-30 21:51:16 +08:00
|
|
|
if (target__none(&opts->target) && evsel__is_group_leader(evsel) &&
|
|
|
|
!opts->initial_delay)
|
2011-11-09 00:41:57 +08:00
|
|
|
attr->enable_on_exec = 1;
|
2014-07-14 18:02:57 +08:00
|
|
|
|
|
|
|
if (evsel->immediate) {
|
|
|
|
attr->disabled = 0;
|
|
|
|
attr->enable_on_exec = 0;
|
|
|
|
}
|
2015-03-31 06:19:31 +08:00
|
|
|
|
|
|
|
clockid = opts->clockid;
|
|
|
|
if (opts->use_clockid) {
|
|
|
|
attr->use_clockid = 1;
|
|
|
|
attr->clockid = opts->clockid;
|
|
|
|
}
|
2015-07-29 17:42:10 +08:00
|
|
|
|
perf tools: Introduce 'P' modifier to request max precision
The 'P' will cause the event to get maximum possible detected precise
level.
Following record:
$ perf record -e cycles:P ...
will detect maximum precise level for 'cycles' event and use it.
Commiter note:
Testing it:
$ perf record -e cycles:P usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.013 MB perf.data (9 samples) ]
$ perf evlist
cycles:P
$ perf evlist -v
cycles:P: size: 112, { sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1,
enable_on_exec: 1, task: 1, precise_ip: 2, sample_id_all: 1, mmap2: 1,
comm_exec: 1
$
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1444068369-20978-6-git-send-email-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-06 02:06:05 +08:00
|
|
|
if (evsel->precise_max)
|
2019-03-14 22:00:10 +08:00
|
|
|
attr->precise_ip = 3;
|
perf tools: Introduce 'P' modifier to request max precision
The 'P' will cause the event to get maximum possible detected precise
level.
Following record:
$ perf record -e cycles:P ...
will detect maximum precise level for 'cycles' event and use it.
Commiter note:
Testing it:
$ perf record -e cycles:P usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.013 MB perf.data (9 samples) ]
$ perf evlist
cycles:P
$ perf evlist -v
cycles:P: size: 112, { sample_period, sample_freq }: 4000, sample_type:
IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1,
enable_on_exec: 1, task: 1, precise_ip: 2, sample_id_all: 1, mmap2: 1,
comm_exec: 1
$
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1444068369-20978-6-git-send-email-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-06 02:06:05 +08:00
|
|
|
|
2016-02-15 16:34:31 +08:00
|
|
|
if (opts->all_user) {
|
|
|
|
attr->exclude_kernel = 1;
|
|
|
|
attr->exclude_user = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (opts->all_kernel) {
|
|
|
|
attr->exclude_kernel = 0;
|
|
|
|
attr->exclude_user = 1;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:24:38 +08:00
|
|
|
if (evsel->core.own_cpus || evsel->unit)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.read_format |= PERF_FORMAT_ID;
|
2018-10-04 05:20:52 +08:00
|
|
|
|
2015-07-29 17:42:10 +08:00
|
|
|
/*
|
|
|
|
* Apply event specific term settings,
|
|
|
|
* it overloads any global configuration.
|
|
|
|
*/
|
2018-01-16 22:16:25 +08:00
|
|
|
apply_config_terms(evsel, opts, track);
|
2016-12-13 15:46:22 +08:00
|
|
|
|
|
|
|
evsel->ignore_missing_thread = opts->ignore_missing_thread;
|
perf record: Fix period option handling
Stephan reported we don't unset PERIOD sample type when --no-period is
specified. Adding the unset check and reset PERIOD if --no-period is
specified.
Committer notes:
Check the sample_type, it shouldn't have PERF_SAMPLE_PERIOD there when
--no-period is used.
Before:
# perf record --no-period sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.018 MB perf.data (7 samples) ]
# perf evlist -v
cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
#
After:
[root@jouet ~]# perf record --no-period sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.019 MB perf.data (17 samples) ]
[root@jouet ~]# perf evlist -v
cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[root@jouet ~]#
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180201083812.11359-3-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-02-01 16:38:11 +08:00
|
|
|
|
|
|
|
/* The --period option takes the precedence. */
|
|
|
|
if (opts->period_set) {
|
|
|
|
if (opts->period)
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__set_sample_bit(evsel, PERIOD);
|
perf record: Fix period option handling
Stephan reported we don't unset PERIOD sample type when --no-period is
specified. Adding the unset check and reset PERIOD if --no-period is
specified.
Committer notes:
Check the sample_type, it shouldn't have PERF_SAMPLE_PERIOD there when
--no-period is used.
Before:
# perf record --no-period sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.018 MB perf.data (7 samples) ]
# perf evlist -v
cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
#
After:
[root@jouet ~]# perf record --no-period sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.019 MB perf.data (17 samples) ]
[root@jouet ~]# perf evlist -v
cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[root@jouet ~]#
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180201083812.11359-3-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-02-01 16:38:11 +08:00
|
|
|
else
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, PERIOD);
|
perf record: Fix period option handling
Stephan reported we don't unset PERIOD sample type when --no-period is
specified. Adding the unset check and reset PERIOD if --no-period is
specified.
Committer notes:
Check the sample_type, it shouldn't have PERF_SAMPLE_PERIOD there when
--no-period is used.
Before:
# perf record --no-period sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.018 MB perf.data (7 samples) ]
# perf evlist -v
cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
#
After:
[root@jouet ~]# perf record --no-period sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.019 MB perf.data (17 samples) ]
[root@jouet ~]# perf evlist -v
cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[root@jouet ~]#
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180201083812.11359-3-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-02-01 16:38:11 +08:00
|
|
|
}
|
2018-07-09 22:15:22 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For initial_delay, a dummy event is added implicitly.
|
|
|
|
* The software event will trigger -EOPNOTSUPP error out,
|
|
|
|
* if BRANCH_STACK bit is set.
|
|
|
|
*/
|
|
|
|
if (opts->initial_delay && is_dummy_event(evsel))
|
2020-04-30 03:12:15 +08:00
|
|
|
evsel__reset_sample_bit(evsel, BRANCH_STACK);
|
2011-11-09 00:41:57 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 03:19:05 +08:00
|
|
|
int evsel__set_filter(struct evsel *evsel, const char *filter)
|
2015-07-04 04:05:50 +08:00
|
|
|
{
|
|
|
|
char *new_filter = strdup(filter);
|
|
|
|
|
|
|
|
if (new_filter != NULL) {
|
|
|
|
free(evsel->filter);
|
|
|
|
evsel->filter = new_filter;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:19:05 +08:00
|
|
|
static int evsel__append_filter(struct evsel *evsel, const char *fmt, const char *filter)
|
2015-07-04 23:19:13 +08:00
|
|
|
{
|
|
|
|
char *new_filter;
|
|
|
|
|
|
|
|
if (evsel->filter == NULL)
|
2020-04-30 03:19:05 +08:00
|
|
|
return evsel__set_filter(evsel, filter);
|
2015-07-04 23:19:13 +08:00
|
|
|
|
2016-09-16 22:44:03 +08:00
|
|
|
if (asprintf(&new_filter, fmt, evsel->filter, filter) > 0) {
|
2015-07-04 23:19:13 +08:00
|
|
|
free(evsel->filter);
|
|
|
|
evsel->filter = new_filter;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:19:05 +08:00
|
|
|
/*
 * Append @filter to the current filter, joining the two as
 * "(current) && (new)".  Returns what evsel__append_filter() returns.
 */
int evsel__append_tp_filter(struct evsel *evsel, const char *filter)
{
	int ret = evsel__append_filter(evsel, "(%s) && (%s)", filter);

	return ret;
}
|
|
|
|
|
2020-04-30 03:19:05 +08:00
|
|
|
/*
 * Append @filter to the current filter, joining the two with a comma
 * ("current,new").  Returns what evsel__append_filter() returns.
 */
int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
{
	int ret = evsel__append_filter(evsel, "%s,%s", filter);

	return ret;
}
|
|
|
|
|
2019-11-21 08:15:21 +08:00
|
|
|
/* Caller has to clear disabled after going through all CPUs. */
|
|
|
|
int evsel__enable_cpu(struct evsel *evsel, int cpu)
|
|
|
|
{
|
|
|
|
return perf_evsel__enable_cpu(&evsel->core, cpu);
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:24:02 +08:00
|
|
|
int evsel__enable(struct evsel *evsel)
|
2013-08-03 08:41:10 +08:00
|
|
|
{
|
2019-07-21 19:24:52 +08:00
|
|
|
int err = perf_evsel__enable(&evsel->core);
|
2018-10-20 20:04:41 +08:00
|
|
|
|
|
|
|
if (!err)
|
|
|
|
evsel->disabled = false;
|
|
|
|
return err;
|
2013-08-03 08:41:10 +08:00
|
|
|
}
|
|
|
|
|
2019-11-21 08:15:21 +08:00
|
|
|
/* Caller has to set disabled after going through all CPUs. */
|
|
|
|
int evsel__disable_cpu(struct evsel *evsel, int cpu)
|
|
|
|
{
|
|
|
|
return perf_evsel__disable_cpu(&evsel->core, cpu);
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:24:03 +08:00
|
|
|
int evsel__disable(struct evsel *evsel)
|
2015-12-03 17:06:41 +08:00
|
|
|
{
|
2019-07-21 19:24:52 +08:00
|
|
|
int err = perf_evsel__disable(&evsel->core);
|
2018-10-20 20:04:41 +08:00
|
|
|
/*
|
|
|
|
* We mark it disabled here so that tools that disable a event can
|
|
|
|
* ignore events after they disable it. I.e. the ring buffer may have
|
|
|
|
* already a few more events queued up before the kernel got the stop
|
|
|
|
* request.
|
|
|
|
*/
|
|
|
|
if (!err)
|
|
|
|
evsel->disabled = true;
|
|
|
|
|
|
|
|
return err;
|
2015-12-03 17:06:41 +08:00
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static void perf_evsel__free_config_terms(struct evsel *evsel)
|
2015-07-29 17:42:10 +08:00
|
|
|
{
|
|
|
|
struct perf_evsel_config_term *term, *h;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(term, h, &evsel->config_terms, list) {
|
2019-07-04 23:13:46 +08:00
|
|
|
list_del_init(&term->list);
|
2020-01-17 13:52:51 +08:00
|
|
|
if (term->free_str)
|
|
|
|
zfree(&term->val.str);
|
2015-07-29 17:42:10 +08:00
|
|
|
free(term);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-30 02:53:17 +08:00
|
|
|
void evsel__exit(struct evsel *evsel)
|
2011-01-04 02:39:04 +08:00
|
|
|
{
|
2019-07-21 19:24:22 +08:00
|
|
|
assert(list_empty(&evsel->core.node));
|
2015-08-27 20:07:40 +08:00
|
|
|
assert(evsel->evlist == NULL);
|
2019-03-19 03:41:28 +08:00
|
|
|
perf_evsel__free_counts(evsel);
|
2019-07-21 19:24:50 +08:00
|
|
|
perf_evsel__free_fd(&evsel->core);
|
2019-09-03 16:34:29 +08:00
|
|
|
perf_evsel__free_id(&evsel->core);
|
2015-07-29 17:42:10 +08:00
|
|
|
perf_evsel__free_config_terms(evsel);
|
2018-03-06 21:10:45 +08:00
|
|
|
cgroup__put(evsel->cgrp);
|
2019-07-21 19:24:37 +08:00
|
|
|
perf_cpu_map__put(evsel->core.cpus);
|
2019-07-21 19:24:38 +08:00
|
|
|
perf_cpu_map__put(evsel->core.own_cpus);
|
2019-07-21 19:24:39 +08:00
|
|
|
perf_thread_map__put(evsel->core.threads);
|
2014-10-17 00:25:01 +08:00
|
|
|
zfree(&evsel->group_name);
|
|
|
|
zfree(&evsel->name);
|
2020-03-15 01:03:56 +08:00
|
|
|
zfree(&evsel->pmu_name);
|
2014-10-10 02:29:51 +08:00
|
|
|
perf_evsel__object.fini(evsel);
|
2011-01-19 07:41:45 +08:00
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:57 +08:00
|
|
|
/*
 * Tear down @evsel with evsel__exit() and then free the struct itself.
 * @evsel must not be used after this returns.
 */
void evsel__delete(struct evsel *evsel)
{
	evsel__exit(evsel);

	free(evsel);
}
|
2011-01-04 03:45:52 +08:00
|
|
|
|
2020-04-30 02:47:38 +08:00
|
|
|
void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
|
|
|
|
struct perf_counts_values *count)
|
2013-01-29 19:47:43 +08:00
|
|
|
{
|
|
|
|
struct perf_counts_values tmp;
|
|
|
|
|
|
|
|
if (!evsel->prev_raw_counts)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (cpu == -1) {
|
|
|
|
tmp = evsel->prev_raw_counts->aggr;
|
|
|
|
evsel->prev_raw_counts->aggr = *count;
|
|
|
|
} else {
|
2015-06-26 17:29:11 +08:00
|
|
|
tmp = *perf_counts(evsel->prev_raw_counts, cpu, thread);
|
|
|
|
*perf_counts(evsel->prev_raw_counts, cpu, thread) = *count;
|
2013-01-29 19:47:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
count->val = count->val - tmp.val;
|
|
|
|
count->ena = count->ena - tmp.ena;
|
|
|
|
count->run = count->run - tmp.run;
|
|
|
|
}
|
|
|
|
|
2014-11-21 17:31:06 +08:00
|
|
|
/*
 * Optionally scale count->val by ena/run.
 *
 * When @scale is set:
 *   - run == 0:   val is forced to 0 and the state is -1;
 *   - run < ena:  val becomes val * ena / run and the state is 1.
 * In every other case val is left alone and the state is 0.
 *
 * The state is stored through @pscaled when that pointer is not NULL.
 */
void perf_counts_values__scale(struct perf_counts_values *count,
			       bool scale, s8 *pscaled)
{
	s8 state = 0;

	if (!scale)
		goto out;

	if (count->run == 0) {
		state = -1;
		count->val = 0;
	} else if (count->run < count->ena) {
		state = 1;
		/* Plain ratio, truncated to u64; no rounding term is added. */
		count->val = (u64)((double) count->val * count->ena / count->run);
	}

out:
	if (pscaled)
		*pscaled = state;
}
|
|
|
|
|
2017-07-26 20:02:05 +08:00
|
|
|
static int
|
2019-07-21 19:23:51 +08:00
|
|
|
perf_evsel__read_one(struct evsel *evsel, int cpu, int thread)
|
2017-07-26 20:02:05 +08:00
|
|
|
{
|
|
|
|
struct perf_counts_values *count = perf_counts(evsel->counts, cpu, thread);
|
|
|
|
|
2019-07-21 19:24:51 +08:00
|
|
|
return perf_evsel__read(&evsel->core, cpu, thread, count);
|
2017-07-26 20:02:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2019-07-21 19:23:51 +08:00
|
|
|
perf_evsel__set_count(struct evsel *counter, int cpu, int thread,
|
2017-07-26 20:02:05 +08:00
|
|
|
u64 val, u64 ena, u64 run)
|
|
|
|
{
|
|
|
|
struct perf_counts_values *count;
|
|
|
|
|
|
|
|
count = perf_counts(counter->counts, cpu, thread);
|
|
|
|
|
|
|
|
count->val = val;
|
|
|
|
count->ena = ena;
|
|
|
|
count->run = run;
|
2019-07-21 19:23:48 +08:00
|
|
|
|
|
|
|
perf_counts__set_loaded(counter->counts, cpu, thread, true);
|
2017-07-26 20:02:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2019-07-21 19:23:51 +08:00
|
|
|
perf_evsel__process_group_data(struct evsel *leader,
|
2017-07-26 20:02:05 +08:00
|
|
|
int cpu, int thread, u64 *data)
|
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 read_format = leader->core.attr.read_format;
|
2017-07-26 20:02:05 +08:00
|
|
|
struct sample_read_value *v;
|
|
|
|
u64 nr, ena = 0, run = 0, i;
|
|
|
|
|
|
|
|
nr = *data++;
|
|
|
|
|
2019-07-21 19:24:46 +08:00
|
|
|
if (nr != (u64) leader->core.nr_members)
|
2017-07-26 20:02:05 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
|
|
|
|
ena = *data++;
|
|
|
|
|
|
|
|
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
|
|
|
|
run = *data++;
|
|
|
|
|
|
|
|
v = (struct sample_read_value *) data;
|
|
|
|
|
|
|
|
perf_evsel__set_count(leader, cpu, thread,
|
|
|
|
v[0].value, ena, run);
|
|
|
|
|
|
|
|
for (i = 1; i < nr; i++) {
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *counter;
|
2017-07-26 20:02:05 +08:00
|
|
|
|
|
|
|
counter = perf_evlist__id2evsel(leader->evlist, v[i].id);
|
|
|
|
if (!counter)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
perf_evsel__set_count(counter, cpu, thread,
|
|
|
|
v[i].value, ena, run);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2019-07-21 19:23:51 +08:00
|
|
|
perf_evsel__read_group(struct evsel *leader, int cpu, int thread)
|
2017-07-26 20:02:05 +08:00
|
|
|
{
|
2017-11-09 23:03:40 +08:00
|
|
|
struct perf_stat_evsel *ps = leader->stats;
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 read_format = leader->core.attr.read_format;
|
2019-07-21 19:24:51 +08:00
|
|
|
int size = perf_evsel__read_size(&leader->core);
|
2017-07-26 20:02:05 +08:00
|
|
|
u64 *data = ps->group_data;
|
|
|
|
|
|
|
|
if (!(read_format & PERF_FORMAT_ID))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2020-04-30 21:51:16 +08:00
|
|
|
if (!evsel__is_group_leader(leader))
|
2017-07-26 20:02:05 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (!data) {
|
|
|
|
data = zalloc(size);
|
|
|
|
if (!data)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ps->group_data = data;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (FD(leader, cpu, thread) < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (readn(FD(leader, cpu, thread), data, size) <= 0)
|
|
|
|
return -errno;
|
|
|
|
|
|
|
|
return perf_evsel__process_group_data(leader, cpu, thread, data);
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
int perf_evsel__read_counter(struct evsel *evsel, int cpu, int thread)
|
2017-07-26 20:02:05 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 read_format = evsel->core.attr.read_format;
|
2017-07-26 20:02:05 +08:00
|
|
|
|
|
|
|
if (read_format & PERF_FORMAT_GROUP)
|
|
|
|
return perf_evsel__read_group(evsel, cpu, thread);
|
|
|
|
else
|
|
|
|
return perf_evsel__read_one(evsel, cpu, thread);
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
int __perf_evsel__read_on_cpu(struct evsel *evsel,
|
2011-01-04 03:45:52 +08:00
|
|
|
int cpu, int thread, bool scale)
|
|
|
|
{
|
|
|
|
struct perf_counts_values count;
|
|
|
|
size_t nv = scale ? 3 : 1;
|
|
|
|
|
|
|
|
if (FD(evsel, cpu, thread) < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2015-06-26 17:29:11 +08:00
|
|
|
if (evsel->counts == NULL && perf_evsel__alloc_counts(evsel, cpu + 1, thread + 1) < 0)
|
2011-01-04 10:13:17 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-04-13 02:23:01 +08:00
|
|
|
if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) <= 0)
|
2011-01-04 03:45:52 +08:00
|
|
|
return -errno;
|
|
|
|
|
2020-04-30 02:47:38 +08:00
|
|
|
evsel__compute_deltas(evsel, cpu, thread, &count);
|
2014-11-21 17:31:06 +08:00
|
|
|
perf_counts_values__scale(&count, scale, NULL);
|
2015-06-26 17:29:11 +08:00
|
|
|
*perf_counts(evsel->counts, cpu, thread) = count;
|
2011-01-04 03:45:52 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static int get_group_fd(struct evsel *evsel, int cpu, int thread)
|
perf tools: Enable grouping logic for parsed events
This patch adds a functionality that allows to create event groups
based on the way they are specified on the command line. Adding
functionality to the '{}' group syntax introduced in earlier patch.
The current '--group/-g' option behaviour remains intact. If you
specify it for record/stat/top command, all the specified events
become members of a single group with the first event as a group
leader.
With the new '{}' group syntax you can create group like:
# perf record -e '{cycles,faults}' ls
resulting in single event group containing 'cycles' and 'faults'
events, with cycles event as group leader.
All groups are created with regards to threads and cpus. Thus
recording an event group within a 2 threads on server with
4 CPUs will create 8 separate groups.
Examples (first event in brackets is group leader):
# 1 group (cpu-clock,task-clock)
perf record --group -e cpu-clock,task-clock ls
perf record -e '{cpu-clock,task-clock}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock},{minor-faults,major-faults}' ls
# 1 group (cpu-clock,task-clock,minor-faults,major-faults)
perf record --group -e cpu-clock,task-clock -e minor-faults,major-faults ls
perf record -e '{cpu-clock,task-clock,minor-faults,major-faults}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock} -e '{minor-faults,major-faults}' \
-e instructions ls
# 1 group
# (cpu-clock,task-clock,minor-faults,major-faults,instructions)
perf record --group -e cpu-clock,task-clock \
-e minor-faults,major-faults -e instructions ls perf record -e
'{cpu-clock,task-clock,minor-faults,major-faults,instructions}' ls
It's possible to use standard event modifier for a group, which spans
over all events in the group and updates each event modifier settings,
for example:
# perf record -r '{faults:k,cache-references}:p'
resulting in ':kp' modifier being used for 'faults' and ':p' modifier
being used for 'cache-references' event.
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@gmail.com>
Link: http://lkml.kernel.org/n/tip-ho42u0wcr8mn1otkalqi13qp@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2012-08-08 18:22:36 +08:00
|
|
|
{
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *leader = evsel->leader;
|
perf tools: Enable grouping logic for parsed events
This patch adds a functionality that allows to create event groups
based on the way they are specified on the command line. Adding
functionality to the '{}' group syntax introduced in earlier patch.
The current '--group/-g' option behaviour remains intact. If you
specify it for record/stat/top command, all the specified events
become members of a single group with the first event as a group
leader.
With the new '{}' group syntax you can create group like:
# perf record -e '{cycles,faults}' ls
resulting in single event group containing 'cycles' and 'faults'
events, with cycles event as group leader.
All groups are created with regards to threads and cpus. Thus
recording an event group within a 2 threads on server with
4 CPUs will create 8 separate groups.
Examples (first event in brackets is group leader):
# 1 group (cpu-clock,task-clock)
perf record --group -e cpu-clock,task-clock ls
perf record -e '{cpu-clock,task-clock}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock},{minor-faults,major-faults}' ls
# 1 group (cpu-clock,task-clock,minor-faults,major-faults)
perf record --group -e cpu-clock,task-clock -e minor-faults,major-faults ls
perf record -e '{cpu-clock,task-clock,minor-faults,major-faults}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock} -e '{minor-faults,major-faults}' \
-e instructions ls
# 1 group
# (cpu-clock,task-clock,minor-faults,major-faults,instructions)
perf record --group -e cpu-clock,task-clock \
-e minor-faults,major-faults -e instructions ls perf record -e
'{cpu-clock,task-clock,minor-faults,major-faults,instructions}' ls
It's possible to use standard event modifier for a group, which spans
over all events in the group and updates each event modifier settings,
for example:
# perf record -r '{faults:k,cache-references}:p'
resulting in ':kp' modifier being used for 'faults' and ':p' modifier
being used for 'cache-references' event.
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@gmail.com>
Link: http://lkml.kernel.org/n/tip-ho42u0wcr8mn1otkalqi13qp@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2012-08-08 18:22:36 +08:00
|
|
|
int fd;
|
|
|
|
|
2020-04-30 21:51:16 +08:00
|
|
|
if (evsel__is_group_leader(evsel))
|
perf tools: Enable grouping logic for parsed events
This patch adds a functionality that allows to create event groups
based on the way they are specified on the command line. Adding
functionality to the '{}' group syntax introduced in earlier patch.
The current '--group/-g' option behaviour remains intact. If you
specify it for record/stat/top command, all the specified events
become members of a single group with the first event as a group
leader.
With the new '{}' group syntax you can create group like:
# perf record -e '{cycles,faults}' ls
resulting in single event group containing 'cycles' and 'faults'
events, with cycles event as group leader.
All groups are created with regards to threads and cpus. Thus
recording an event group within a 2 threads on server with
4 CPUs will create 8 separate groups.
Examples (first event in brackets is group leader):
# 1 group (cpu-clock,task-clock)
perf record --group -e cpu-clock,task-clock ls
perf record -e '{cpu-clock,task-clock}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock},{minor-faults,major-faults}' ls
# 1 group (cpu-clock,task-clock,minor-faults,major-faults)
perf record --group -e cpu-clock,task-clock -e minor-faults,major-faults ls
perf record -e '{cpu-clock,task-clock,minor-faults,major-faults}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock} -e '{minor-faults,major-faults}' \
-e instructions ls
# 1 group
# (cpu-clock,task-clock,minor-faults,major-faults,instructions)
perf record --group -e cpu-clock,task-clock \
-e minor-faults,major-faults -e instructions ls perf record -e
'{cpu-clock,task-clock,minor-faults,major-faults,instructions}' ls
It's possible to use standard event modifier for a group, which spans
over all events in the group and updates each event modifier settings,
for example:
# perf record -r '{faults:k,cache-references}:p'
resulting in ':kp' modifier being used for 'faults' and ':p' modifier
being used for 'cache-references' event.
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@gmail.com>
Link: http://lkml.kernel.org/n/tip-ho42u0wcr8mn1otkalqi13qp@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2012-08-08 18:22:36 +08:00
|
|
|
return -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Leader must be already processed/open,
|
|
|
|
* if not it's a bug.
|
|
|
|
*/
|
2019-07-21 19:24:45 +08:00
|
|
|
BUG_ON(!leader->core.fd);
|
perf tools: Enable grouping logic for parsed events
This patch adds a functionality that allows to create event groups
based on the way they are specified on the command line. Adding
functionality to the '{}' group syntax introduced in earlier patch.
The current '--group/-g' option behaviour remains intact. If you
specify it for record/stat/top command, all the specified events
become members of a single group with the first event as a group
leader.
With the new '{}' group syntax you can create group like:
# perf record -e '{cycles,faults}' ls
resulting in single event group containing 'cycles' and 'faults'
events, with cycles event as group leader.
All groups are created with regards to threads and cpus. Thus
recording an event group within a 2 threads on server with
4 CPUs will create 8 separate groups.
Examples (first event in brackets is group leader):
# 1 group (cpu-clock,task-clock)
perf record --group -e cpu-clock,task-clock ls
perf record -e '{cpu-clock,task-clock}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock},{minor-faults,major-faults}' ls
# 1 group (cpu-clock,task-clock,minor-faults,major-faults)
perf record --group -e cpu-clock,task-clock -e minor-faults,major-faults ls
perf record -e '{cpu-clock,task-clock,minor-faults,major-faults}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock} -e '{minor-faults,major-faults}' \
-e instructions ls
# 1 group
# (cpu-clock,task-clock,minor-faults,major-faults,instructions)
perf record --group -e cpu-clock,task-clock \
-e minor-faults,major-faults -e instructions ls perf record -e
'{cpu-clock,task-clock,minor-faults,major-faults,instructions}' ls
It's possible to use standard event modifier for a group, which spans
over all events in the group and updates each event modifier settings,
for example:
# perf record -r '{faults:k,cache-references}:p'
resulting in ':kp' modifier being used for 'faults' and ':p' modifier
being used for 'cache-references' event.
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@gmail.com>
Link: http://lkml.kernel.org/n/tip-ho42u0wcr8mn1otkalqi13qp@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2012-08-08 18:22:36 +08:00
|
|
|
|
|
|
|
fd = FD(leader, cpu, thread);
|
|
|
|
BUG_ON(fd == -1);
|
|
|
|
|
|
|
|
return fd;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static void perf_evsel__remove_fd(struct evsel *pos,
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitors several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more genetic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
int nr_cpus, int nr_threads,
|
|
|
|
int thread_idx)
|
|
|
|
{
|
|
|
|
for (int cpu = 0; cpu < nr_cpus; cpu++)
|
|
|
|
for (int thread = thread_idx; thread < nr_threads - 1; thread++)
|
|
|
|
FD(pos, cpu, thread) = FD(pos, cpu, thread + 1);
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static int update_fds(struct evsel *evsel,
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitors several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more generic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
int nr_cpus, int cpu_idx,
|
|
|
|
int nr_threads, int thread_idx)
|
|
|
|
{
|
2019-07-21 19:23:51 +08:00
|
|
|
struct evsel *pos;
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitor several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more generic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
|
|
|
|
if (cpu_idx >= nr_cpus || thread_idx >= nr_threads)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
evlist__for_each_entry(evsel->evlist, pos) {
|
|
|
|
nr_cpus = pos != evsel ? nr_cpus : cpu_idx;
|
|
|
|
|
|
|
|
perf_evsel__remove_fd(pos, nr_cpus, nr_threads, thread_idx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Since fds for next evsel has not been created,
|
|
|
|
* there is no need to iterate whole event list.
|
|
|
|
*/
|
|
|
|
if (pos == evsel)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static bool ignore_missing_thread(struct evsel *evsel,
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitors several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more genetic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
int nr_cpus, int cpu,
|
2019-07-21 19:23:50 +08:00
|
|
|
struct perf_thread_map *threads,
|
2016-12-13 15:46:22 +08:00
|
|
|
int thread, int err)
|
|
|
|
{
|
2019-08-22 19:11:41 +08:00
|
|
|
pid_t ignore_pid = perf_thread_map__pid(threads, thread);
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitors several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more genetic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
|
2016-12-13 15:46:22 +08:00
|
|
|
if (!evsel->ignore_missing_thread)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* The system wide setup does not work with threads. */
|
2019-08-06 17:35:19 +08:00
|
|
|
if (evsel->core.system_wide)
|
2016-12-13 15:46:22 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
/* The -ESRCH is perf event syscall errno for pid's not found. */
|
|
|
|
if (err != -ESRCH)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* If there's only one thread, let it fail. */
|
|
|
|
if (threads->nr == 1)
|
|
|
|
return false;
|
|
|
|
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitors several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more genetic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
/*
|
|
|
|
* We should remove fd for missing_thread first
|
|
|
|
* because thread_map__remove() will decrease threads->nr.
|
|
|
|
*/
|
|
|
|
if (update_fds(evsel, nr_cpus, cpu, threads->nr, thread))
|
|
|
|
return false;
|
|
|
|
|
2016-12-13 15:46:22 +08:00
|
|
|
if (thread_map__remove(threads, thread))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
pr_warning("WARNING: Ignored open failure for pid %d\n",
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf continue monitoring the remaining threads and do not exit with
error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitors several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more genetic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
ignore_pid);
|
2016-12-13 15:46:22 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-09-25 02:41:51 +08:00
|
|
|
static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
|
|
|
|
void *priv __maybe_unused)
|
|
|
|
{
|
|
|
|
return fprintf(fp, " %-32s %s\n", name, val);
|
|
|
|
}
|
|
|
|
|
2019-03-14 22:00:10 +08:00
|
|
|
static void display_attr(struct perf_event_attr *attr)
|
|
|
|
{
|
2019-11-08 17:41:28 +08:00
|
|
|
if (verbose >= 2 || debug_peo_args) {
|
2019-03-14 22:00:10 +08:00
|
|
|
fprintf(stderr, "%.60s\n", graph_dotted_line);
|
|
|
|
fprintf(stderr, "perf_event_attr:\n");
|
|
|
|
perf_event_attr__fprintf(stderr, attr, __open_attr__fprintf, NULL);
|
|
|
|
fprintf(stderr, "%.60s\n", graph_dotted_line);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static int perf_event_open(struct evsel *evsel,
|
2019-03-14 22:00:10 +08:00
|
|
|
pid_t pid, int cpu, int group_fd,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
int precise_ip = evsel->core.attr.precise_ip;
|
2019-03-14 22:00:10 +08:00
|
|
|
int fd;
|
|
|
|
|
|
|
|
while (1) {
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx",
|
2019-03-14 22:00:10 +08:00
|
|
|
pid, cpu, group_fd, flags);
|
|
|
|
|
2019-07-21 19:24:29 +08:00
|
|
|
fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, group_fd, flags);
|
2019-03-14 22:00:10 +08:00
|
|
|
if (fd >= 0)
|
|
|
|
break;
|
|
|
|
|
2019-07-03 16:09:49 +08:00
|
|
|
/* Do not try less precise if not requested. */
|
|
|
|
if (!evsel->precise_max)
|
2019-03-14 22:00:10 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We tried all the precise_ip values, and it's
|
|
|
|
* still failing, so leave it to standard fallback.
|
|
|
|
*/
|
2019-07-21 19:24:29 +08:00
|
|
|
if (!evsel->core.attr.precise_ip) {
|
|
|
|
evsel->core.attr.precise_ip = precise_ip;
|
2019-03-14 22:00:10 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("\nsys_perf_event_open failed, error %d\n", -ENOTSUP);
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.precise_ip--;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("decreasing precise_ip by one (%d)\n", evsel->core.attr.precise_ip);
|
2019-07-21 19:24:29 +08:00
|
|
|
display_attr(&evsel->core.attr);
|
2019-03-14 22:00:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return fd;
|
|
|
|
}
|
|
|
|
|
2019-11-21 08:15:19 +08:00
|
|
|
static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
|
|
|
|
struct perf_thread_map *threads,
|
|
|
|
int start_cpu, int end_cpu)
|
2011-01-04 03:48:12 +08:00
|
|
|
{
|
2014-07-31 14:00:51 +08:00
|
|
|
int cpu, thread, nthreads;
|
2014-07-01 04:28:47 +08:00
|
|
|
unsigned long flags = PERF_FLAG_FD_CLOEXEC;
|
2019-10-21 01:51:54 +08:00
|
|
|
int pid = -1, err, old_errno;
|
2013-08-05 10:41:26 +08:00
|
|
|
enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
|
2011-01-04 03:48:12 +08:00
|
|
|
|
2019-08-13 22:06:38 +08:00
|
|
|
if ((perf_missing_features.write_backward && evsel->core.attr.write_backward) ||
|
|
|
|
(perf_missing_features.aux_output && evsel->core.attr.aux_output))
|
2016-07-14 16:34:33 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2017-02-14 21:59:04 +08:00
|
|
|
if (cpus == NULL) {
|
2019-07-21 19:23:49 +08:00
|
|
|
static struct perf_cpu_map *empty_cpu_map;
|
2017-02-14 21:59:04 +08:00
|
|
|
|
|
|
|
if (empty_cpu_map == NULL) {
|
2019-07-21 19:24:16 +08:00
|
|
|
empty_cpu_map = perf_cpu_map__dummy_new();
|
2017-02-14 21:59:04 +08:00
|
|
|
if (empty_cpu_map == NULL)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
cpus = empty_cpu_map;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (threads == NULL) {
|
2019-07-21 19:23:50 +08:00
|
|
|
static struct perf_thread_map *empty_thread_map;
|
2017-02-14 21:59:04 +08:00
|
|
|
|
|
|
|
if (empty_thread_map == NULL) {
|
|
|
|
empty_thread_map = thread_map__new_by_tid(-1);
|
|
|
|
if (empty_thread_map == NULL)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
threads = empty_thread_map;
|
|
|
|
}
|
|
|
|
|
2019-08-06 17:35:19 +08:00
|
|
|
if (evsel->core.system_wide)
|
2014-07-31 14:00:51 +08:00
|
|
|
nthreads = 1;
|
|
|
|
else
|
|
|
|
nthreads = threads->nr;
|
|
|
|
|
2019-07-21 19:24:45 +08:00
|
|
|
if (evsel->core.fd == NULL &&
|
2019-07-21 19:24:48 +08:00
|
|
|
perf_evsel__alloc_fd(&evsel->core, cpus->nr, nthreads) < 0)
|
2011-10-25 20:42:19 +08:00
|
|
|
return -ENOMEM;
|
2011-01-04 10:13:17 +08:00
|
|
|
|
perf tool: Add cgroup support
This patch adds the ability to filter monitoring based on container groups
(cgroups) for both perf stat and perf record. It is possible to monitor
multiple cgroup in parallel. There is one cgroup per event. The cgroups to
monitor are passed via a new -G option followed by a comma separated list of
cgroup names.
The cgroup filesystem has to be mounted. Given a cgroup name, the perf tool
finds the corresponding directory in the cgroup filesystem and opens it. It
then passes that file descriptor to the kernel.
Example:
$ perf stat -B -a -e cycles:u,cycles:u,cycles:u -G test1,,test2 -- sleep 1
Performance counter stats for 'sleep 1':
2,368,667,414 cycles test1
2,369,661,459 cycles
<not counted> cycles test2
1.001856890 seconds time elapsed
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590290.825bdf0a.7d0a.4890@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-02-14 17:20:01 +08:00
|
|
|
if (evsel->cgrp) {
|
2014-07-01 04:28:47 +08:00
|
|
|
flags |= PERF_FLAG_PID_CGROUP;
|
perf tool: Add cgroup support
This patch adds the ability to filter monitoring based on container groups
(cgroups) for both perf stat and perf record. It is possible to monitor
multiple cgroup in parallel. There is one cgroup per event. The cgroups to
monitor are passed via a new -G option followed by a comma separated list of
cgroup names.
The cgroup filesystem has to be mounted. Given a cgroup name, the perf tool
finds the corresponding directory in the cgroup filesystem and opens it. It
then passes that file descriptor to the kernel.
Example:
$ perf stat -B -a -e cycles:u,cycles:u,cycles:u -G test1,,test2 -- sleep 1
Performance counter stats for 'sleep 1':
2,368,667,414 cycles test1
2,369,661,459 cycles
<not counted> cycles test2
1.001856890 seconds time elapsed
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590290.825bdf0a.7d0a.4890@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-02-14 17:20:01 +08:00
|
|
|
pid = evsel->cgrp->fd;
|
|
|
|
}
|
|
|
|
|
2012-12-14 00:13:07 +08:00
|
|
|
fallback_missing_features:
|
2015-03-31 06:19:31 +08:00
|
|
|
if (perf_missing_features.clockid_wrong)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.clockid = CLOCK_MONOTONIC; /* should always work */
|
2015-03-31 06:19:31 +08:00
|
|
|
if (perf_missing_features.clockid) {
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.use_clockid = 0;
|
|
|
|
evsel->core.attr.clockid = 0;
|
2015-03-31 06:19:31 +08:00
|
|
|
}
|
2014-07-01 04:28:47 +08:00
|
|
|
if (perf_missing_features.cloexec)
|
|
|
|
flags &= ~(unsigned long)PERF_FLAG_FD_CLOEXEC;
|
2013-08-21 18:10:25 +08:00
|
|
|
if (perf_missing_features.mmap2)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.mmap2 = 0;
|
2012-12-14 00:13:07 +08:00
|
|
|
if (perf_missing_features.exclude_guest)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.exclude_guest = evsel->core.attr.exclude_host = 0;
|
2015-12-12 08:12:24 +08:00
|
|
|
if (perf_missing_features.lbr_flags)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
|
2015-12-12 08:12:24 +08:00
|
|
|
PERF_SAMPLE_BRANCH_NO_CYCLES);
|
2019-07-21 19:24:29 +08:00
|
|
|
if (perf_missing_features.group_read && evsel->core.attr.inherit)
|
|
|
|
evsel->core.attr.read_format &= ~(PERF_FORMAT_GROUP|PERF_FORMAT_ID);
|
2019-01-18 00:15:17 +08:00
|
|
|
if (perf_missing_features.ksymbol)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.ksymbol = 0;
|
2019-08-27 06:31:06 +08:00
|
|
|
if (perf_missing_features.bpf)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.bpf_event = 0;
|
perf evsel: Support PERF_SAMPLE_BRANCH_HW_INDEX
A new branch sample type PERF_SAMPLE_BRANCH_HW_INDEX has been introduced
in latest kernel.
Enable HW_INDEX by default in LBR call stack mode.
If kernel doesn't support the sample type, switching it off.
Add HW_INDEX in attr_fprintf as well. User can check whether the branch
sample type is set via debug information or header.
Committer testing:
First collect some samples with LBR callchains, system wide, for a few
seconds:
# perf record --call-graph lbr -a sleep 5
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.625 MB perf.data (224 samples) ]
#
Now lets use 'perf evlist -v' to look at the branch_sample_type:
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES|HW_INDEX
#
So the machine has the kernel feature, and it was correctly added to
perf_event_attr.branch_sample_type, for the default 'cycles' event.
If we do it in another machine, where the kernel lacks the HW_INDEX
feature, we get:
# perf record --call-graph lbr -a sleep 2s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.690 MB perf.data (499 samples) ]
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES
#
No HW_INDEX in attr.branch_sample_type.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Pavel Gerasimov <pavel.gerasimov@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vitaly Slobodskoy <vitaly.slobodskoy@intel.com>
Link: http://lore.kernel.org/lkml/20200228163011.19358-3-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-02-29 00:30:01 +08:00
|
|
|
if (perf_missing_features.branch_hw_idx)
|
|
|
|
evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX;
|
2012-12-14 00:13:07 +08:00
|
|
|
retry_sample_id:
|
|
|
|
if (perf_missing_features.sample_id_all)
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.sample_id_all = 0;
|
2012-12-14 00:13:07 +08:00
|
|
|
|
2019-07-21 19:24:29 +08:00
|
|
|
display_attr(&evsel->core.attr);
|
2013-08-14 20:48:24 +08:00
|
|
|
|
2019-11-21 08:15:19 +08:00
|
|
|
for (cpu = start_cpu; cpu < end_cpu; cpu++) {
|
2011-01-12 10:08:18 +08:00
|
|
|
|
2014-07-31 14:00:51 +08:00
|
|
|
for (thread = 0; thread < nthreads; thread++) {
|
2016-12-12 18:35:40 +08:00
|
|
|
int fd, group_fd;
|
perf tool: Add cgroup support
This patch adds the ability to filter monitoring based on container groups
(cgroups) for both perf stat and perf record. It is possible to monitor
multiple cgroup in parallel. There is one cgroup per event. The cgroups to
monitor are passed via a new -G option followed by a comma separated list of
cgroup names.
The cgroup filesystem has to be mounted. Given a cgroup name, the perf tool
finds the corresponding directory in the cgroup filesystem and opens it. It
then passes that file descriptor to the kernel.
Example:
$ perf stat -B -a -e cycles:u,cycles:u,cycles:u -G test1,,test2 -- sleep 1
Performance counter stats for 'sleep 1':
2,368,667,414 cycles test1
2,369,661,459 cycles
<not counted> cycles test2
1.001856890 seconds time elapsed
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590290.825bdf0a.7d0a.4890@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-02-14 17:20:01 +08:00
|
|
|
|
2019-08-06 17:35:19 +08:00
|
|
|
if (!evsel->cgrp && !evsel->core.system_wide)
|
2019-08-22 19:11:41 +08:00
|
|
|
pid = perf_thread_map__pid(threads, thread);
|
perf tool: Add cgroup support
This patch adds the ability to filter monitoring based on container groups
(cgroups) for both perf stat and perf record. It is possible to monitor
multiple cgroup in parallel. There is one cgroup per event. The cgroups to
monitor are passed via a new -G option followed by a comma separated list of
cgroup names.
The cgroup filesystem has to be mounted. Given a cgroup name, the perf tool
finds the corresponding directory in the cgroup filesystem and opens it. It
then passes that file descriptor to the kernel.
Example:
$ perf stat -B -a -e cycles:u,cycles:u,cycles:u -G test1,,test2 -- sleep 1
Performance counter stats for 'sleep 1':
2,368,667,414 cycles test1
2,369,661,459 cycles
<not counted> cycles test2
1.001856890 seconds time elapsed
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590290.825bdf0a.7d0a.4890@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-02-14 17:20:01 +08:00
|
|
|
|
perf tools: Enable grouping logic for parsed events
This patch adds a functionality that allows to create event groups
based on the way they are specified on the command line. Adding
functionality to the '{}' group syntax introduced in earlier patch.
The current '--group/-g' option behaviour remains intact. If you
specify it for record/stat/top command, all the specified events
become members of a single group with the first event as a group
leader.
With the new '{}' group syntax you can create group like:
# perf record -e '{cycles,faults}' ls
resulting in single event group containing 'cycles' and 'faults'
events, with cycles event as group leader.
All groups are created with regards to threads and cpus. Thus
recording an event group within a 2 threads on server with
4 CPUs will create 8 separate groups.
Examples (first event in brackets is group leader):
# 1 group (cpu-clock,task-clock)
perf record --group -e cpu-clock,task-clock ls
perf record -e '{cpu-clock,task-clock}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock},{minor-faults,major-faults}' ls
# 1 group (cpu-clock,task-clock,minor-faults,major-faults)
perf record --group -e cpu-clock,task-clock -e minor-faults,major-faults ls
perf record -e '{cpu-clock,task-clock,minor-faults,major-faults}' ls
# 2 groups (cpu-clock,task-clock) (minor-faults,major-faults)
perf record -e '{cpu-clock,task-clock} -e '{minor-faults,major-faults}' \
-e instructions ls
# 1 group
# (cpu-clock,task-clock,minor-faults,major-faults,instructions)
perf record --group -e cpu-clock,task-clock \
-e minor-faults,major-faults -e instructions ls perf record -e
'{cpu-clock,task-clock,minor-faults,major-faults,instructions}' ls
It's possible to use standard event modifier for a group, which spans
over all events in the group and updates each event modifier settings,
for example:
# perf record -r '{faults:k,cache-references}:p'
resulting in ':kp' modifier being used for 'faults' and ':p' modifier
being used for 'cache-references' event.
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@gmail.com>
Link: http://lkml.kernel.org/n/tip-ho42u0wcr8mn1otkalqi13qp@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2012-08-08 18:22:36 +08:00
|
|
|
group_fd = get_group_fd(evsel, cpu, thread);
|
2013-08-05 10:41:26 +08:00
|
|
|
retry_open:
|
2017-07-03 22:50:18 +08:00
|
|
|
test_attr__ready();
|
|
|
|
|
2019-03-14 22:00:10 +08:00
|
|
|
fd = perf_event_open(evsel, pid, cpus->map[cpu],
|
|
|
|
group_fd, flags);
|
2016-12-12 18:35:40 +08:00
|
|
|
|
|
|
|
FD(evsel, cpu, thread) = fd;
|
|
|
|
|
|
|
|
if (fd < 0) {
|
2011-10-25 20:42:19 +08:00
|
|
|
err = -errno;
|
2016-12-13 15:46:22 +08:00
|
|
|
|
perf evsel: Enable ignore_missing_thread for pid option
While monitoring a multithread process with pid option, perf sometimes
may return sys_perf_event_open failure with 3(No such process) if any of
the process's threads die before we open the event. However, we want
perf to continue monitoring the remaining threads and not exit with
an error.
Here, the patch enables perf_evsel::ignore_missing_thread for -p option
to ignore complete failure if any of threads die before we open the event.
But it may still return sys_perf_event_open failure with 22(Invalid) if we
monitor several event groups.
sys_perf_event_open: pid 28960 cpu 40 group_fd 118202 flags 0x8
sys_perf_event_open: pid 28961 cpu 40 group_fd 118203 flags 0x8
WARNING: Ignored open failure for pid 28962
sys_perf_event_open: pid 28962 cpu 40 group_fd [118203] flags 0x8
sys_perf_event_open failed, error -22
That is because when we ignore a missing thread, we change the thread_idx
without dealing with its fds, FD(evsel, cpu, thread). Then get_group_fd()
may return a wrong group_fd for the next thread and sys_perf_event_open()
return with 22.
sys_perf_event_open(){
...
if (group_fd != -1)
perf_fget_light()//to get corresponding group_leader by group_fd
...
if (group_leader)
if (group_leader->ctx->task != ctx->task)//should on the same task
goto err_context
...
}
This patch also fixes this bug by introducing perf_evsel__remove_fd() and
update_fds to allow removing fds for the missing thread.
Changes since v1:
- Change group_fd__remove() into a more generic way without changing code logic
- Remove redundant condition
Changes since v2:
- Use a proper function name and add some comment.
- Multiline comment style fixes.
Committer testing:
Before this patch the recently added 'perf stat --per-thread' for system
wide counting would race while enumerating all threads using /proc:
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]# perf stat --per-thread
failed to parse CPUs map: No such file or directory
Usage: perf stat [<options>] [<command>]
-C, --cpu <cpu> list of cpus to monitor in system-wide
-a, --all-cpus system-wide collection from all CPUs
[root@jouet ~]#
When, say, the kernel was being built, so lots of shortlived threads,
after this patch this doesn't happen.
Signed-off-by: Mengting Zhang <zhangmengting@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
[ Remove one use 'evlist' alias variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-12-13 15:01:53 +08:00
|
|
|
if (ignore_missing_thread(evsel, cpus->nr, cpu, threads, thread, err)) {
|
2016-12-13 15:46:22 +08:00
|
|
|
/*
|
|
|
|
* We just removed 1 thread, so take a step
|
|
|
|
* back on thread index and lower the upper
|
|
|
|
* nthreads limit.
|
|
|
|
*/
|
|
|
|
nthreads--;
|
|
|
|
thread--;
|
|
|
|
|
|
|
|
/* ... and pretend like nothing have happened. */
|
|
|
|
err = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("\nsys_perf_event_open failed, error %d\n",
|
2013-11-01 21:51:29 +08:00
|
|
|
err);
|
2012-12-14 00:13:07 +08:00
|
|
|
goto try_fallback;
|
2011-10-25 20:42:19 +08:00
|
|
|
}
|
perf bpf: Attach eBPF filter to perf event
This is the final patch which makes basic BPF filter work. After
applying this patch, users are allowed to use BPF filter like:
# perf record --event ./hello_world.o ls
A bpf_fd field is appended to 'struct evsel', and setup during the
callback function add_bpf_event() for each 'probe_trace_event'.
PERF_EVENT_IOC_SET_BPF ioctl is used to attach eBPF program to a newly
created perf event. The file descriptor of the eBPF program is passed to
perf record using previous patches, and stored into evsel->bpf_fd.
It is possible that different perf events are created for one kprobe
event for different CPUs. In this case, when trying to call the ioctl,
EEXIST will be returned. This patch doesn't treat it as an error.
Committer note:
The bpf proggie used so far:
__attribute__((section("fork=_do_fork"), used))
int fork(void *ctx)
{
return 0;
}
char _license[] __attribute__((section("license"), used)) = "GPL";
int _version __attribute__((section("version"), used)) = 0x40300;
failed to produce any samples, even with forks happening and it being
running in system wide mode.
That is because now the filter is being associated, and the code above
always returns zero, meaning that all forks will be probed but filtered
away ;-/
Change it to 'return 1;' instead and after that:
# trace --no-syscalls --event /tmp/foo.o
0.000 perf_bpf_probe:fork:(ffffffff8109be30))
2.333 perf_bpf_probe:fork:(ffffffff8109be30))
3.725 perf_bpf_probe:fork:(ffffffff8109be30))
4.550 perf_bpf_probe:fork:(ffffffff8109be30))
^C#
And it works with all tools, including 'perf trace'.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1444826502-49291-8-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-14 20:41:18 +08:00
|
|
|
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo(" = %d\n", fd);
|
2016-11-22 05:33:26 +08:00
|
|
|
|
perf bpf: Attach eBPF filter to perf event
This is the final patch which makes basic BPF filter work. After
applying this patch, users are allowed to use BPF filter like:
# perf record --event ./hello_world.o ls
A bpf_fd field is appended to 'struct evsel', and setup during the
callback function add_bpf_event() for each 'probe_trace_event'.
PERF_EVENT_IOC_SET_BPF ioctl is used to attach eBPF program to a newly
created perf event. The file descriptor of the eBPF program is passed to
perf record using previous patches, and stored into evsel->bpf_fd.
It is possible that different perf events are created for one kprobe
event for different CPUs. In this case, when trying to call the ioctl,
EEXIST will be returned. This patch doesn't treat it as an error.
Committer note:
The bpf proggie used so far:
__attribute__((section("fork=_do_fork"), used))
int fork(void *ctx)
{
return 0;
}
char _license[] __attribute__((section("license"), used)) = "GPL";
int _version __attribute__((section("version"), used)) = 0x40300;
failed to produce any samples, even with forks happening and it being
running in system wide mode.
That is because now the filter is being associated, and the code above
always returns zero, meaning that all forks will be probed but filtered
away ;-/
Change it to 'return 1;' instead and after that:
# trace --no-syscalls --event /tmp/foo.o
0.000 perf_bpf_probe:fork:(ffffffff8109be30))
2.333 perf_bpf_probe:fork:(ffffffff8109be30))
3.725 perf_bpf_probe:fork:(ffffffff8109be30))
4.550 perf_bpf_probe:fork:(ffffffff8109be30))
^C#
And it works with all tools, including 'perf trace'.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1444826502-49291-8-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-14 20:41:18 +08:00
|
|
|
if (evsel->bpf_fd >= 0) {
|
2016-12-12 18:35:40 +08:00
|
|
|
int evt_fd = fd;
|
perf bpf: Attach eBPF filter to perf event
This is the final patch which makes basic BPF filter work. After
applying this patch, users are allowed to use BPF filter like:
# perf record --event ./hello_world.o ls
A bpf_fd field is appended to 'struct evsel', and setup during the
callback function add_bpf_event() for each 'probe_trace_event'.
PERF_EVENT_IOC_SET_BPF ioctl is used to attach eBPF program to a newly
created perf event. The file descriptor of the eBPF program is passed to
perf record using previous patches, and stored into evsel->bpf_fd.
It is possible that different perf events are created for one kprobe
event for different CPUs. In this case, when trying to call the ioctl,
EEXIST will be returned. This patch doesn't treat it as an error.
Committer note:
The bpf proggie used so far:
__attribute__((section("fork=_do_fork"), used))
int fork(void *ctx)
{
return 0;
}
char _license[] __attribute__((section("license"), used)) = "GPL";
int _version __attribute__((section("version"), used)) = 0x40300;
failed to produce any samples, even with forks happening and it being
running in system wide mode.
That is because now the filter is being associated, and the code above
always returns zero, meaning that all forks will be probed but filtered
away ;-/
Change it to 'return 1;' instead and after that:
# trace --no-syscalls --event /tmp/foo.o
0.000 perf_bpf_probe:fork:(ffffffff8109be30))
2.333 perf_bpf_probe:fork:(ffffffff8109be30))
3.725 perf_bpf_probe:fork:(ffffffff8109be30))
4.550 perf_bpf_probe:fork:(ffffffff8109be30))
^C#
And it works with all tools, including 'perf trace'.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zefan Li <lizefan@huawei.com>
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1444826502-49291-8-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-10-14 20:41:18 +08:00
|
|
|
int bpf_fd = evsel->bpf_fd;
|
|
|
|
|
|
|
|
err = ioctl(evt_fd,
|
|
|
|
PERF_EVENT_IOC_SET_BPF,
|
|
|
|
bpf_fd);
|
|
|
|
if (err && errno != EEXIST) {
|
|
|
|
pr_err("failed to attach bpf fd %d: %s\n",
|
|
|
|
bpf_fd, strerror(errno));
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out_close;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-05 10:41:26 +08:00
|
|
|
set_rlimit = NO_CHANGE;
|
2015-03-31 06:19:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we succeeded but had to kill clockid, fail and
|
|
|
|
* have perf_evsel__open_strerror() print us a nice
|
|
|
|
* error.
|
|
|
|
*/
|
|
|
|
if (perf_missing_features.clockid ||
|
|
|
|
perf_missing_features.clockid_wrong) {
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out_close;
|
|
|
|
}
|
2011-01-04 21:55:27 +08:00
|
|
|
}
|
2011-01-04 03:48:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2012-12-14 00:13:07 +08:00
|
|
|
try_fallback:
|
2013-08-05 10:41:26 +08:00
|
|
|
/*
|
|
|
|
* perf stat needs between 5 and 22 fds per CPU. When we run out
|
|
|
|
* of them try to increase the limits.
|
|
|
|
*/
|
|
|
|
if (err == -EMFILE && set_rlimit < INCREASED_MAX) {
|
|
|
|
struct rlimit l;
|
|
|
|
|
2019-10-21 01:51:54 +08:00
|
|
|
old_errno = errno;
|
2013-08-05 10:41:26 +08:00
|
|
|
if (getrlimit(RLIMIT_NOFILE, &l) == 0) {
|
|
|
|
if (set_rlimit == NO_CHANGE)
|
|
|
|
l.rlim_cur = l.rlim_max;
|
|
|
|
else {
|
|
|
|
l.rlim_cur = l.rlim_max + 1000;
|
|
|
|
l.rlim_max = l.rlim_cur;
|
|
|
|
}
|
|
|
|
if (setrlimit(RLIMIT_NOFILE, &l) == 0) {
|
|
|
|
set_rlimit++;
|
|
|
|
errno = old_errno;
|
|
|
|
goto retry_open;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
errno = old_errno;
|
|
|
|
}
|
|
|
|
|
2012-12-14 00:13:07 +08:00
|
|
|
if (err != -EINVAL || cpu > 0 || thread > 0)
|
|
|
|
goto out_close;
|
|
|
|
|
2015-03-31 06:19:31 +08:00
|
|
|
/*
|
|
|
|
* Must probe features in the order they were added to the
|
|
|
|
* perf_event_attr interface.
|
|
|
|
*/
|
2020-03-25 20:45:34 +08:00
|
|
|
if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) {
|
|
|
|
perf_missing_features.cgroup = true;
|
|
|
|
pr_debug2_peo("Kernel has no cgroup sampling support, bailing out\n");
|
|
|
|
goto out_close;
|
|
|
|
} else if (!perf_missing_features.branch_hw_idx &&
|
perf evsel: Support PERF_SAMPLE_BRANCH_HW_INDEX
A new branch sample type PERF_SAMPLE_BRANCH_HW_INDEX has been introduced
in latest kernel.
Enable HW_INDEX by default in LBR call stack mode.
If kernel doesn't support the sample type, switching it off.
Add HW_INDEX in attr_fprintf as well. User can check whether the branch
sample type is set via debug information or header.
Committer testing:
First collect some samples with LBR callchains, system wide, for a few
seconds:
# perf record --call-graph lbr -a sleep 5
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.625 MB perf.data (224 samples) ]
#
Now lets use 'perf evlist -v' to look at the branch_sample_type:
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES|HW_INDEX
#
So the machine has the kernel feature, and it was correctly added to
perf_event_attr.branch_sample_type, for the default 'cycles' event.
If we do it in another machine, where the kernel lacks the HW_INDEX
feature, we get:
# perf record --call-graph lbr -a sleep 2s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.690 MB perf.data (499 samples) ]
# perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES
#
No HW_INDEX in attr.branch_sample_type.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Pavel Gerasimov <pavel.gerasimov@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vitaly Slobodskoy <vitaly.slobodskoy@intel.com>
Link: http://lore.kernel.org/lkml/20200228163011.19358-3-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-02-29 00:30:01 +08:00
|
|
|
(evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) {
|
|
|
|
perf_missing_features.branch_hw_idx = true;
|
|
|
|
pr_debug2("switching off branch HW index support\n");
|
|
|
|
goto fallback_missing_features;
|
|
|
|
} else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) {
|
2019-08-13 22:06:38 +08:00
|
|
|
perf_missing_features.aux_output = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("Kernel has no attr.aux_output support, bailing out\n");
|
2019-08-13 22:06:38 +08:00
|
|
|
goto out_close;
|
2019-08-27 06:31:06 +08:00
|
|
|
} else if (!perf_missing_features.bpf && evsel->core.attr.bpf_event) {
|
|
|
|
perf_missing_features.bpf = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off bpf_event\n");
|
perf tools: Handle PERF_RECORD_BPF_EVENT
This patch adds basic handling of PERF_RECORD_BPF_EVENT. Tracking of
PERF_RECORD_BPF_EVENT is OFF by default. Option --bpf-event is added to
turn it on.
Committer notes:
Add dummy machine__process_bpf_event() variant that returns zero for
systems without HAVE_LIBBPF_SUPPORT, such as Alpine Linux, unbreaking
the build in such systems.
Remove the needless include <machine.h> from bpf-event.h, provide just
forward declarations for the structs and unions in the parameters, to
reduce compilation time and needless rebuilds when machine.h gets
changed.
Committer testing:
When running with:
# perf record --bpf-event
On an older kernel where PERF_RECORD_BPF_EVENT and PERF_RECORD_KSYMBOL
are not present, we fall back to removing those two bits from
perf_event_attr, making the tool continue to work on older kernels:
perf_event_attr:
size 112
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|PERIOD
read_format ID
disabled 1
inherit 1
mmap 1
comm 1
freq 1
enable_on_exec 1
task 1
precise_ip 3
sample_id_all 1
exclude_guest 1
mmap2 1
comm_exec 1
ksymbol 1
bpf_event 1
------------------------------------------------------------
sys_perf_event_open: pid 5779 cpu 0 group_fd -1 flags 0x8
sys_perf_event_open failed, error -22
switching off bpf_event
------------------------------------------------------------
perf_event_attr:
size 112
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|PERIOD
read_format ID
disabled 1
inherit 1
mmap 1
comm 1
freq 1
enable_on_exec 1
task 1
precise_ip 3
sample_id_all 1
exclude_guest 1
mmap2 1
comm_exec 1
ksymbol 1
------------------------------------------------------------
sys_perf_event_open: pid 5779 cpu 0 group_fd -1 flags 0x8
sys_perf_event_open failed, error -22
switching off ksymbol
------------------------------------------------------------
perf_event_attr:
size 112
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|PERIOD
read_format ID
disabled 1
inherit 1
mmap 1
comm 1
freq 1
enable_on_exec 1
task 1
precise_ip 3
sample_id_all 1
exclude_guest 1
mmap2 1
comm_exec 1
------------------------------------------------------------
And then proceeds to work without those two features.
As passing --bpf-event is an explicit action performed by the user, perhaps we
should emit a warning telling that the kernel has no such feature, but this can
be done on top of this patch.
Now with a kernel that supports these events, start the 'record --bpf-event -a'
and then run 'perf trace sleep 10000' that will use the BPF
augmented_raw_syscalls.o prebuilt (for another kernel version even) and thus
should generate PERF_RECORD_BPF_EVENT events:
[root@quaco ~]# perf record -e dummy -a --bpf-event
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.713 MB perf.data ]
[root@quaco ~]# bpftool prog
13: cgroup_skb tag 7be49e3934a125ba gpl
loaded_at 2019-01-19T09:09:43-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 13,14
14: cgroup_skb tag 2a142ef67aaad174 gpl
loaded_at 2019-01-19T09:09:43-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 13,14
15: cgroup_skb tag 7be49e3934a125ba gpl
loaded_at 2019-01-19T09:09:43-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 15,16
16: cgroup_skb tag 2a142ef67aaad174 gpl
loaded_at 2019-01-19T09:09:43-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 15,16
17: cgroup_skb tag 7be49e3934a125ba gpl
loaded_at 2019-01-19T09:09:44-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 17,18
18: cgroup_skb tag 2a142ef67aaad174 gpl
loaded_at 2019-01-19T09:09:44-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 17,18
21: cgroup_skb tag 7be49e3934a125ba gpl
loaded_at 2019-01-19T09:09:45-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 21,22
22: cgroup_skb tag 2a142ef67aaad174 gpl
loaded_at 2019-01-19T09:09:45-0300 uid 0
xlated 296B jited 229B memlock 4096B map_ids 21,22
31: tracepoint name sys_enter tag 12504ba9402f952f gpl
loaded_at 2019-01-19T09:19:56-0300 uid 0
xlated 512B jited 374B memlock 4096B map_ids 30,29,28
32: tracepoint name sys_exit tag c1bd85c092d6e4aa gpl
loaded_at 2019-01-19T09:19:56-0300 uid 0
xlated 256B jited 191B memlock 4096B map_ids 30,29
# perf report -D | grep PERF_RECORD_BPF_EVENT | nl
1 0 55834574849 0x4fc8 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 13
2 0 60129542145 0x5118 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 14
3 0 64424509441 0x5268 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 15
4 0 68719476737 0x53b8 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 16
5 0 73014444033 0x5508 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 17
6 0 77309411329 0x5658 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 18
7 0 90194313217 0x57a8 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 21
8 0 94489280513 0x58f8 [0x18]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 22
9 7 620922484360 0xb6390 [0x30]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 29
10 7 620922486018 0xb6410 [0x30]: PERF_RECORD_BPF_EVENT bpf event with type 2, flags 0, id 29
11 7 620922579199 0xb6490 [0x30]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 30
12 7 620922580240 0xb6510 [0x30]: PERF_RECORD_BPF_EVENT bpf event with type 2, flags 0, id 30
13 7 620922765207 0xb6598 [0x30]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 31
14 7 620922874543 0xb6620 [0x30]: PERF_RECORD_BPF_EVENT bpf event with type 1, flags 0, id 32
#
There, the 31 and 32 tracepoint BPF programs put in place by 'perf trace'.
Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-team@fb.com
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20190117161521.1341602-7-songliubraving@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-01-18 00:15:18 +08:00
|
|
|
goto fallback_missing_features;
|
2019-07-21 19:24:29 +08:00
|
|
|
} else if (!perf_missing_features.ksymbol && evsel->core.attr.ksymbol) {
|
2019-01-18 00:15:17 +08:00
|
|
|
perf_missing_features.ksymbol = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off ksymbol\n");
|
2019-01-18 00:15:17 +08:00
|
|
|
goto fallback_missing_features;
|
2019-07-21 19:24:29 +08:00
|
|
|
} else if (!perf_missing_features.write_backward && evsel->core.attr.write_backward) {
|
2016-06-20 18:47:18 +08:00
|
|
|
perf_missing_features.write_backward = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off write_backward\n");
|
2016-07-14 16:34:33 +08:00
|
|
|
goto out_close;
|
2019-07-21 19:24:29 +08:00
|
|
|
} else if (!perf_missing_features.clockid_wrong && evsel->core.attr.use_clockid) {
|
2015-03-31 06:19:31 +08:00
|
|
|
perf_missing_features.clockid_wrong = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off clockid\n");
|
2015-03-31 06:19:31 +08:00
|
|
|
goto fallback_missing_features;
|
2019-07-21 19:24:29 +08:00
|
|
|
} else if (!perf_missing_features.clockid && evsel->core.attr.use_clockid) {
|
2015-03-31 06:19:31 +08:00
|
|
|
perf_missing_features.clockid = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off use_clockid\n");
|
2015-03-31 06:19:31 +08:00
|
|
|
goto fallback_missing_features;
|
|
|
|
} else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) {
|
2014-07-01 04:28:47 +08:00
|
|
|
perf_missing_features.cloexec = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off cloexec flag\n");
|
2014-07-01 04:28:47 +08:00
|
|
|
goto fallback_missing_features;
|
2019-07-21 19:24:29 +08:00
|
|
|
} else if (!perf_missing_features.mmap2 && evsel->core.attr.mmap2) {
|
2013-08-21 18:10:25 +08:00
|
|
|
perf_missing_features.mmap2 = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off mmap2\n");
|
2013-08-21 18:10:25 +08:00
|
|
|
goto fallback_missing_features;
|
|
|
|
} else if (!perf_missing_features.exclude_guest &&
|
2019-07-21 19:24:29 +08:00
|
|
|
(evsel->core.attr.exclude_guest || evsel->core.attr.exclude_host)) {
|
2012-12-14 00:13:07 +08:00
|
|
|
perf_missing_features.exclude_guest = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off exclude_guest, exclude_host\n");
|
2012-12-14 00:13:07 +08:00
|
|
|
goto fallback_missing_features;
|
|
|
|
} else if (!perf_missing_features.sample_id_all) {
|
|
|
|
perf_missing_features.sample_id_all = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off sample_id_all\n");
|
2012-12-14 00:13:07 +08:00
|
|
|
goto retry_sample_id;
|
2015-12-12 08:12:24 +08:00
|
|
|
} else if (!perf_missing_features.lbr_flags &&
|
2019-07-21 19:24:29 +08:00
|
|
|
(evsel->core.attr.branch_sample_type &
|
2015-12-12 08:12:24 +08:00
|
|
|
(PERF_SAMPLE_BRANCH_NO_CYCLES |
|
|
|
|
PERF_SAMPLE_BRANCH_NO_FLAGS))) {
|
|
|
|
perf_missing_features.lbr_flags = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off branch sample type no (cycles/flags)\n");
|
2015-12-12 08:12:24 +08:00
|
|
|
goto fallback_missing_features;
|
perf stat: Use group read for event groups
Make perf stat use group read if there are groups defined. The group
read will get the values for all member of groups within a single
syscall instead of calling read syscall for every event.
We can see considerably fewer kernel cycles spent on a single
group read than on reading each event separately, as with the following perf
stat command:
# perf stat -e {cycles,instructions} -I 10 -a sleep 1
Monitored with "perf stat -r 5 -e '{cycles:u,cycles:k}'"
Before:
24,325,676 cycles:u
297,040,775 cycles:k
1.038554134 seconds time elapsed
After:
25,034,418 cycles:u
158,256,395 cycles:k
1.036864497 seconds time elapsed
The perf_evsel__open fallback changes contributed by Andi Kleen.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20170726120206.9099-4-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-26 20:02:06 +08:00
|
|
|
} else if (!perf_missing_features.group_read &&
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.inherit &&
|
|
|
|
(evsel->core.attr.read_format & PERF_FORMAT_GROUP) &&
|
2020-04-30 21:51:16 +08:00
|
|
|
evsel__is_group_leader(evsel)) {
|
perf stat: Use group read for event groups
Make perf stat use group read if there are groups defined. The group
read will get the values for all member of groups within a single
syscall instead of calling read syscall for every event.
We can see considerable less amount of kernel cycles spent on single
group read, than reading each event separately, like for following perf
stat command:
# perf stat -e {cycles,instructions} -I 10 -a sleep 1
Monitored with "perf stat -r 5 -e '{cycles:u,cycles:k}'"
Before:
24,325,676 cycles:u
297,040,775 cycles:k
1.038554134 seconds time elapsed
After:
25,034,418 cycles:u
158,256,395 cycles:k
1.036864497 seconds time elapsed
The perf_evsel__open fallback changes contributed by Andi Kleen.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20170726120206.9099-4-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-26 20:02:06 +08:00
|
|
|
perf_missing_features.group_read = true;
|
2019-11-08 17:41:28 +08:00
|
|
|
pr_debug2_peo("switching off group read\n");
|
perf stat: Use group read for event groups
Make perf stat use group read if there are groups defined. The group
read will get the values for all member of groups within a single
syscall instead of calling read syscall for every event.
We can see considerable less amount of kernel cycles spent on single
group read, than reading each event separately, like for following perf
stat command:
# perf stat -e {cycles,instructions} -I 10 -a sleep 1
Monitored with "perf stat -r 5 -e '{cycles:u,cycles:k}'"
Before:
24,325,676 cycles:u
297,040,775 cycles:k
1.038554134 seconds time elapsed
After:
25,034,418 cycles:u
158,256,395 cycles:k
1.036864497 seconds time elapsed
The perf_evsel__open fallback changes contributed by Andi Kleen.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20170726120206.9099-4-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-26 20:02:06 +08:00
|
|
|
goto fallback_missing_features;
|
2012-12-14 00:13:07 +08:00
|
|
|
}
|
2011-01-04 03:48:12 +08:00
|
|
|
out_close:
|
perf stat: Ignore error thread when enabling system-wide --per-thread
If we execute 'perf stat --per-thread' with non-root account (even set
kernel.perf_event_paranoid = -1 yet), it reports the error:
jinyao@skl:~$ perf stat --per-thread
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
>= 0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
Disallow raw tracepoint access by users without CAP_SYS_ADMIN
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
To make this setting permanent, edit /etc/sysctl.conf too, e.g.:
kernel.perf_event_paranoid = -1
Perhaps the ptrace rule doesn't allow to trace some processes. But anyway
the global --per-thread mode had better ignore such errors and continue
working on other threads.
This patch will record the index of error thread in perf_evsel__open()
and remove this thread before retrying.
For example (run with non-root, kernel.perf_event_paranoid isn't set):
jinyao@skl:~$ perf stat --per-thread
^C
Performance counter stats for 'system wide':
vmstat-3458 6.171984 cpu-clock:u (msec) # 0.000 CPUs utilized
perf-3670 0.515599 cpu-clock:u (msec) # 0.000 CPUs utilized
vmstat-3458 1,163,643 cycles:u # 0.189 GHz
perf-3670 40,881 cycles:u # 0.079 GHz
vmstat-3458 1,410,238 instructions:u # 1.21 insn per cycle
perf-3670 3,536 instructions:u # 0.09 insn per cycle
vmstat-3458 288,937 branches:u # 46.814 M/sec
perf-3670 936 branches:u # 1.815 M/sec
vmstat-3458 15,195 branch-misses:u # 5.26% of all branches
perf-3670 76 branch-misses:u # 8.12% of all branches
12.651675247 seconds time elapsed
Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1516117388-10120-1-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-01-16 23:43:08 +08:00
|
|
|
if (err)
|
|
|
|
threads->err_thread = thread;
|
|
|
|
|
2019-10-21 01:51:54 +08:00
|
|
|
old_errno = errno;
|
2011-01-04 21:55:27 +08:00
|
|
|
do {
|
|
|
|
while (--thread >= 0) {
|
2019-10-21 01:51:55 +08:00
|
|
|
if (FD(evsel, cpu, thread) >= 0)
|
|
|
|
close(FD(evsel, cpu, thread));
|
2011-01-04 21:55:27 +08:00
|
|
|
FD(evsel, cpu, thread) = -1;
|
|
|
|
}
|
2014-07-31 14:00:51 +08:00
|
|
|
thread = nthreads;
|
2011-01-04 21:55:27 +08:00
|
|
|
} while (--cpu >= 0);
|
2019-10-21 01:51:54 +08:00
|
|
|
errno = old_errno;
|
2011-10-25 20:42:19 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2019-11-21 08:15:19 +08:00
|
|
|
int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
|
|
|
|
struct perf_thread_map *threads)
|
|
|
|
{
|
|
|
|
return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1);
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:24:50 +08:00
|
|
|
void evsel__close(struct evsel *evsel)
|
2011-10-25 20:42:19 +08:00
|
|
|
{
|
2019-07-21 19:24:50 +08:00
|
|
|
perf_evsel__close(&evsel->core);
|
2019-09-03 16:34:29 +08:00
|
|
|
perf_evsel__free_id(&evsel->core);
|
2011-01-04 03:48:12 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 03:21:03 +08:00
|
|
|
int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu)
|
2011-01-04 03:48:12 +08:00
|
|
|
{
|
2019-11-21 08:15:19 +08:00
|
|
|
if (cpu == -1)
|
|
|
|
return evsel__open_cpu(evsel, cpus, NULL, 0,
|
|
|
|
cpus ? cpus->nr : 1);
|
|
|
|
|
|
|
|
return evsel__open_cpu(evsel, cpus, NULL, cpu, cpu + 1);
|
2011-01-04 21:55:27 +08:00
|
|
|
}
|
2011-01-04 03:48:12 +08:00
|
|
|
|
2020-04-30 03:21:03 +08:00
|
|
|
/* Open @evsel for each entry of @threads; no CPU map is passed (cpus == NULL). */
int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads)
{
	struct perf_cpu_map *no_cpus = NULL;

	return evsel__open(evsel, no_cpus, threads);
}
|
2011-01-13 03:03:24 +08:00
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
static int perf_evsel__parse_id_sample(const struct evsel *evsel,
|
2012-09-26 23:48:18 +08:00
|
|
|
const union perf_event *event,
|
|
|
|
struct perf_sample *sample)
|
2011-01-21 23:46:41 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 type = evsel->core.attr.sample_type;
|
2019-08-26 02:17:52 +08:00
|
|
|
const __u64 *array = event->sample.array;
|
2012-09-26 23:48:18 +08:00
|
|
|
bool swapped = evsel->needs_swap;
|
2012-05-30 20:23:44 +08:00
|
|
|
union u64_swap u;
|
2011-01-21 23:46:41 +08:00
|
|
|
|
|
|
|
array += ((event->header.size -
|
|
|
|
sizeof(event->header)) / sizeof(u64)) - 1;
|
|
|
|
|
2013-08-27 16:23:09 +08:00
|
|
|
if (type & PERF_SAMPLE_IDENTIFIER) {
|
|
|
|
sample->id = *array;
|
|
|
|
array--;
|
|
|
|
}
|
|
|
|
|
2011-01-21 23:46:41 +08:00
|
|
|
if (type & PERF_SAMPLE_CPU) {
|
2012-05-30 20:23:44 +08:00
|
|
|
u.val64 = *array;
|
|
|
|
if (swapped) {
|
|
|
|
/* undo swap of u64, then swap on individual u32s */
|
|
|
|
u.val64 = bswap_64(u.val64);
|
|
|
|
u.val32[0] = bswap_32(u.val32[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
sample->cpu = u.val32[0];
|
2011-01-21 23:46:41 +08:00
|
|
|
array--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_STREAM_ID) {
|
|
|
|
sample->stream_id = *array;
|
|
|
|
array--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_ID) {
|
|
|
|
sample->id = *array;
|
|
|
|
array--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_TIME) {
|
|
|
|
sample->time = *array;
|
|
|
|
array--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_TID) {
|
2012-05-30 20:23:44 +08:00
|
|
|
u.val64 = *array;
|
|
|
|
if (swapped) {
|
|
|
|
/* undo swap of u64, then swap on individual u32s */
|
|
|
|
u.val64 = bswap_64(u.val64);
|
|
|
|
u.val32[0] = bswap_32(u.val32[0]);
|
|
|
|
u.val32[1] = bswap_32(u.val32[1]);
|
|
|
|
}
|
|
|
|
|
|
|
|
sample->pid = u.val32[0];
|
|
|
|
sample->tid = u.val32[1];
|
2013-10-18 20:29:01 +08:00
|
|
|
array--;
|
2011-01-21 23:46:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
static inline bool overflow(const void *endp, u16 max_size, const void *offset,
|
|
|
|
u64 size)
|
2011-05-22 02:08:15 +08:00
|
|
|
{
|
2013-08-27 16:23:04 +08:00
|
|
|
return size > max_size || offset + size > endp;
|
|
|
|
}
|
2011-05-22 02:08:15 +08:00
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
/*
 * Bounds-check helpers for the sample parser.
 *
 * Both macros expect a local named 'endp' in the enclosing function and
 * make that function return -EFAULT when overflow() reports that the
 * requested span is out of bounds.
 */
#define OVERFLOW_CHECK(offset, size, max_size)				\
	do {								\
		if (overflow(endp, (max_size), (offset), (size)))	\
			return -EFAULT;					\
	} while (0)

/* Same check for a single u64 located at @offset. */
#define OVERFLOW_CHECK_u64(offset) \
	OVERFLOW_CHECK(offset, sizeof(u64), sizeof(u64))
|
2011-05-22 02:08:15 +08:00
|
|
|
|
2017-08-03 19:10:28 +08:00
|
|
|
static int
|
|
|
|
perf_event__check_size(union perf_event *event, unsigned int sample_size)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The evsel's sample_size is based on PERF_SAMPLE_MASK which includes
|
|
|
|
* up to PERF_SAMPLE_PERIOD. After that overflow() must be used to
|
|
|
|
* check the format does not go past the end of the event.
|
|
|
|
*/
|
|
|
|
if (sample_size + sizeof(event->header) > event->header.size)
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event,
|
2012-09-26 23:48:18 +08:00
|
|
|
struct perf_sample *data)
|
2011-01-21 23:46:41 +08:00
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 type = evsel->core.attr.sample_type;
|
2012-09-26 23:48:18 +08:00
|
|
|
bool swapped = evsel->needs_swap;
|
2019-08-26 02:17:52 +08:00
|
|
|
const __u64 *array;
|
2013-08-27 16:23:04 +08:00
|
|
|
u16 max_size = event->header.size;
|
|
|
|
const void *endp = (void *)event + max_size;
|
|
|
|
u64 sz;
|
2011-01-21 23:46:41 +08:00
|
|
|
|
2011-09-06 23:12:26 +08:00
|
|
|
/*
|
|
|
|
* used for cross-endian analysis. See git commit 65014ab3
|
|
|
|
* for why this goofiness is needed.
|
|
|
|
*/
|
2012-05-16 14:59:04 +08:00
|
|
|
union u64_swap u;
|
2011-09-06 23:12:26 +08:00
|
|
|
|
2011-12-16 00:32:39 +08:00
|
|
|
memset(data, 0, sizeof(*data));
|
2011-01-21 23:46:41 +08:00
|
|
|
data->cpu = data->pid = data->tid = -1;
|
|
|
|
data->stream_id = data->id = data->time = -1ULL;
|
2019-07-21 19:24:29 +08:00
|
|
|
data->period = evsel->core.attr.sample_period;
|
2016-03-23 05:23:43 +08:00
|
|
|
data->cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
|
2018-01-08 00:03:52 +08:00
|
|
|
data->misc = event->header.misc;
|
2017-08-03 22:07:05 +08:00
|
|
|
data->id = -1ULL;
|
|
|
|
data->data_src = PERF_MEM_DATA_SRC_NONE;
|
2011-01-21 23:46:41 +08:00
|
|
|
|
|
|
|
if (event->header.type != PERF_RECORD_SAMPLE) {
|
2019-07-21 19:24:29 +08:00
|
|
|
if (!evsel->core.attr.sample_id_all)
|
2011-01-21 23:46:41 +08:00
|
|
|
return 0;
|
2012-09-26 23:48:18 +08:00
|
|
|
return perf_evsel__parse_id_sample(evsel, event, data);
|
2011-01-21 23:46:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
array = event->sample.array;
|
|
|
|
|
2017-08-03 19:10:28 +08:00
|
|
|
if (perf_event__check_size(event, evsel->sample_size))
|
2011-05-22 01:33:04 +08:00
|
|
|
return -EFAULT;
|
|
|
|
|
2013-08-27 16:23:09 +08:00
|
|
|
if (type & PERF_SAMPLE_IDENTIFIER) {
|
|
|
|
data->id = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
2011-01-21 23:46:41 +08:00
|
|
|
if (type & PERF_SAMPLE_IP) {
|
2013-08-27 16:23:06 +08:00
|
|
|
data->ip = *array;
|
2011-01-21 23:46:41 +08:00
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_TID) {
|
2011-09-06 23:12:26 +08:00
|
|
|
u.val64 = *array;
|
|
|
|
if (swapped) {
|
|
|
|
/* undo swap of u64, then swap on individual u32s */
|
|
|
|
u.val64 = bswap_64(u.val64);
|
|
|
|
u.val32[0] = bswap_32(u.val32[0]);
|
|
|
|
u.val32[1] = bswap_32(u.val32[1]);
|
|
|
|
}
|
|
|
|
|
|
|
|
data->pid = u.val32[0];
|
|
|
|
data->tid = u.val32[1];
|
2011-01-21 23:46:41 +08:00
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_TIME) {
|
|
|
|
data->time = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_ADDR) {
|
|
|
|
data->addr = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_ID) {
|
|
|
|
data->id = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_STREAM_ID) {
|
|
|
|
data->stream_id = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_CPU) {
|
2011-09-06 23:12:26 +08:00
|
|
|
|
|
|
|
u.val64 = *array;
|
|
|
|
if (swapped) {
|
|
|
|
/* undo swap of u64, then swap on individual u32s */
|
|
|
|
u.val64 = bswap_64(u.val64);
|
|
|
|
u.val32[0] = bswap_32(u.val32[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
data->cpu = u.val32[0];
|
2011-01-21 23:46:41 +08:00
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_PERIOD) {
|
|
|
|
data->period = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_READ) {
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 read_format = evsel->core.attr.read_format;
|
2012-10-10 23:38:13 +08:00
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2012-10-10 23:38:13 +08:00
|
|
|
if (read_format & PERF_FORMAT_GROUP)
|
|
|
|
data->read.group.nr = *array;
|
|
|
|
else
|
|
|
|
data->read.one.value = *array;
|
|
|
|
|
|
|
|
array++;
|
|
|
|
|
|
|
|
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2012-10-10 23:38:13 +08:00
|
|
|
data->read.time_enabled = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2012-10-10 23:38:13 +08:00
|
|
|
data->read.time_running = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
|
|
|
|
if (read_format & PERF_FORMAT_GROUP) {
|
2013-08-27 16:23:04 +08:00
|
|
|
const u64 max_group_nr = UINT64_MAX /
|
|
|
|
sizeof(struct sample_read_value);
|
|
|
|
|
|
|
|
if (data->read.group.nr > max_group_nr)
|
|
|
|
return -EFAULT;
|
|
|
|
sz = data->read.group.nr *
|
|
|
|
sizeof(struct sample_read_value);
|
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
|
|
|
data->read.group.values =
|
|
|
|
(struct sample_read_value *)array;
|
|
|
|
array = (void *)array + sz;
|
2012-10-10 23:38:13 +08:00
|
|
|
} else {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2012-10-10 23:38:13 +08:00
|
|
|
data->read.one.id = *array;
|
|
|
|
array++;
|
|
|
|
}
|
2011-01-21 23:46:41 +08:00
|
|
|
}
|
|
|
|
|
2020-04-01 18:16:07 +08:00
|
|
|
if (type & PERF_SAMPLE_CALLCHAIN) {
|
2013-08-27 16:23:04 +08:00
|
|
|
const u64 max_callchain_nr = UINT64_MAX / sizeof(u64);
|
2011-05-22 02:08:15 +08:00
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
|
|
|
data->callchain = (struct ip_callchain *)array++;
|
|
|
|
if (data->callchain->nr > max_callchain_nr)
|
2011-05-22 02:08:15 +08:00
|
|
|
return -EFAULT;
|
2013-08-27 16:23:04 +08:00
|
|
|
sz = data->callchain->nr * sizeof(u64);
|
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
|
|
|
array = (void *)array + sz;
|
2011-01-21 23:46:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_RAW) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2011-09-06 23:12:26 +08:00
|
|
|
u.val64 = *array;
|
2017-11-30 02:43:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Undo swap of u64, then swap on individual u32s,
|
|
|
|
* get the size of the raw area and undo all of the
|
|
|
|
* swap. The pevent interface handles endianity by
|
|
|
|
* itself.
|
|
|
|
*/
|
|
|
|
if (swapped) {
|
2011-09-06 23:12:26 +08:00
|
|
|
u.val64 = bswap_64(u.val64);
|
|
|
|
u.val32[0] = bswap_32(u.val32[0]);
|
|
|
|
u.val32[1] = bswap_32(u.val32[1]);
|
|
|
|
}
|
|
|
|
data->raw_size = u.val32[0];
|
2017-11-30 02:43:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The raw data is aligned on 64bits including the
|
|
|
|
* u32 size, so it's safe to use mem_bswap_64.
|
|
|
|
*/
|
|
|
|
if (swapped)
|
|
|
|
mem_bswap_64((void *) array, data->raw_size);
|
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
array = (void *)array + sizeof(u32);
|
2011-05-22 02:08:15 +08:00
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK(array, data->raw_size, max_size);
|
|
|
|
data->raw_data = (void *)array;
|
|
|
|
array = (void *)array + data->raw_size;
|
2011-01-21 23:46:41 +08:00
|
|
|
}
|
|
|
|
|
2012-02-10 06:21:01 +08:00
|
|
|
if (type & PERF_SAMPLE_BRANCH_STACK) {
|
2013-08-27 16:23:04 +08:00
|
|
|
const u64 max_branch_nr = UINT64_MAX /
|
|
|
|
sizeof(struct branch_entry);
|
2012-02-10 06:21:01 +08:00
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
|
|
|
data->branch_stack = (struct branch_stack *)array++;
|
2012-02-10 06:21:01 +08:00
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
if (data->branch_stack->nr > max_branch_nr)
|
|
|
|
return -EFAULT;
|
2020-02-29 00:30:00 +08:00
|
|
|
|
2012-02-10 06:21:01 +08:00
|
|
|
sz = data->branch_stack->nr * sizeof(struct branch_entry);
|
2020-02-29 00:30:00 +08:00
|
|
|
if (perf_evsel__has_branch_hw_idx(evsel))
|
|
|
|
sz += sizeof(u64);
|
|
|
|
else
|
|
|
|
data->no_hw_idx = true;
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
|
|
|
array = (void *)array + sz;
|
2012-02-10 06:21:01 +08:00
|
|
|
}
|
2012-08-07 21:20:45 +08:00
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_REGS_USER) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2013-08-27 16:23:10 +08:00
|
|
|
data->user_regs.abi = *array;
|
|
|
|
array++;
|
2012-08-07 21:20:45 +08:00
|
|
|
|
2013-08-27 16:23:10 +08:00
|
|
|
if (data->user_regs.abi) {
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 mask = evsel->core.attr.sample_regs_user;
|
2013-08-27 16:23:04 +08:00
|
|
|
|
2019-04-10 16:16:43 +08:00
|
|
|
sz = hweight64(mask) * sizeof(u64);
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
2014-01-07 20:47:25 +08:00
|
|
|
data->user_regs.mask = mask;
|
2012-08-07 21:20:45 +08:00
|
|
|
data->user_regs.regs = (u64 *)array;
|
2013-08-27 16:23:04 +08:00
|
|
|
array = (void *)array + sz;
|
2012-08-07 21:20:45 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_STACK_USER) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
|
|
|
sz = *array++;
|
2012-08-07 21:20:45 +08:00
|
|
|
|
|
|
|
data->user_stack.offset = ((char *)(array - 1)
|
|
|
|
- (char *) event);
|
|
|
|
|
2013-08-27 16:23:04 +08:00
|
|
|
if (!sz) {
|
2012-08-07 21:20:45 +08:00
|
|
|
data->user_stack.size = 0;
|
|
|
|
} else {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
2012-08-07 21:20:45 +08:00
|
|
|
data->user_stack.data = (char *)array;
|
2013-08-27 16:23:04 +08:00
|
|
|
array = (void *)array + sz;
|
|
|
|
OVERFLOW_CHECK_u64(array);
|
2013-07-04 21:20:34 +08:00
|
|
|
data->user_stack.size = *array++;
|
2013-10-02 21:46:39 +08:00
|
|
|
if (WARN_ONCE(data->user_stack.size > sz,
|
|
|
|
"user stack dump failure\n"))
|
|
|
|
return -EFAULT;
|
2012-08-07 21:20:45 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-01-24 23:10:29 +08:00
|
|
|
if (type & PERF_SAMPLE_WEIGHT) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2013-01-24 23:10:29 +08:00
|
|
|
data->weight = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
2013-01-24 23:10:35 +08:00
|
|
|
if (type & PERF_SAMPLE_DATA_SRC) {
|
2013-08-27 16:23:04 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2013-01-24 23:10:35 +08:00
|
|
|
data->data_src = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
2013-09-20 22:40:43 +08:00
|
|
|
if (type & PERF_SAMPLE_TRANSACTION) {
|
2013-11-01 21:51:36 +08:00
|
|
|
OVERFLOW_CHECK_u64(array);
|
2013-09-20 22:40:43 +08:00
|
|
|
data->transaction = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
2014-09-24 19:48:39 +08:00
|
|
|
data->intr_regs.abi = PERF_SAMPLE_REGS_ABI_NONE;
|
|
|
|
if (type & PERF_SAMPLE_REGS_INTR) {
|
|
|
|
OVERFLOW_CHECK_u64(array);
|
|
|
|
data->intr_regs.abi = *array;
|
|
|
|
array++;
|
|
|
|
|
|
|
|
if (data->intr_regs.abi != PERF_SAMPLE_REGS_ABI_NONE) {
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 mask = evsel->core.attr.sample_regs_intr;
|
2014-09-24 19:48:39 +08:00
|
|
|
|
2019-04-10 16:16:43 +08:00
|
|
|
sz = hweight64(mask) * sizeof(u64);
|
2014-09-24 19:48:39 +08:00
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
|
|
|
data->intr_regs.mask = mask;
|
|
|
|
data->intr_regs.regs = (u64 *)array;
|
|
|
|
array = (void *)array + sz;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-30 01:11:08 +08:00
|
|
|
data->phys_addr = 0;
|
|
|
|
if (type & PERF_SAMPLE_PHYS_ADDR) {
|
|
|
|
data->phys_addr = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
2020-03-25 20:45:30 +08:00
|
|
|
data->cgroup = 0;
|
|
|
|
if (type & PERF_SAMPLE_CGROUP) {
|
|
|
|
data->cgroup = *array;
|
|
|
|
array++;
|
|
|
|
}
|
|
|
|
|
2019-11-15 20:42:11 +08:00
|
|
|
if (type & PERF_SAMPLE_AUX) {
|
|
|
|
OVERFLOW_CHECK_u64(array);
|
|
|
|
sz = *array++;
|
|
|
|
|
|
|
|
OVERFLOW_CHECK(array, sz, max_size);
|
|
|
|
/* Undo swap of data */
|
|
|
|
if (swapped)
|
|
|
|
mem_bswap_64((char *)array, sz);
|
|
|
|
data->aux_sample.size = sz;
|
|
|
|
data->aux_sample.data = (char *)array;
|
|
|
|
array = (void *)array + sz;
|
|
|
|
}
|
|
|
|
|
2011-01-21 23:46:41 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2011-11-28 17:03:31 +08:00
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
int perf_evsel__parse_sample_timestamp(struct evsel *evsel,
|
2017-08-03 19:10:28 +08:00
|
|
|
union perf_event *event,
|
|
|
|
u64 *timestamp)
|
|
|
|
{
|
2019-07-21 19:24:29 +08:00
|
|
|
u64 type = evsel->core.attr.sample_type;
|
2019-08-26 02:17:52 +08:00
|
|
|
const __u64 *array;
|
2017-08-03 19:10:28 +08:00
|
|
|
|
|
|
|
if (!(type & PERF_SAMPLE_TIME))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (event->header.type != PERF_RECORD_SAMPLE) {
|
|
|
|
struct perf_sample data = {
|
|
|
|
.time = -1ULL,
|
|
|
|
};
|
|
|
|
|
2019-07-21 19:24:29 +08:00
|
|
|
if (!evsel->core.attr.sample_id_all)
|
2017-08-03 19:10:28 +08:00
|
|
|
return -1;
|
|
|
|
if (perf_evsel__parse_id_sample(evsel, event, &data))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
*timestamp = data.time;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
array = event->sample.array;
|
|
|
|
|
|
|
|
if (perf_event__check_size(event, evsel->sample_size))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_IDENTIFIER)
|
|
|
|
array++;
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_IP)
|
|
|
|
array++;
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_TID)
|
|
|
|
array++;
|
|
|
|
|
|
|
|
if (type & PERF_SAMPLE_TIME)
|
|
|
|
*timestamp = *array;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-04-30 03:26:57 +08:00
|
|
|
struct tep_format_field *evsel__field(struct evsel *evsel, const char *name)
|
2012-09-18 22:21:50 +08:00
|
|
|
{
|
2018-08-09 02:02:50 +08:00
|
|
|
return tep_find_field(evsel->tp_format, name);
|
2012-09-18 22:21:50 +08:00
|
|
|
}
|
|
|
|
|
2020-04-30 03:26:57 +08:00
|
|
|
void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char *name)
|
2012-09-12 06:24:23 +08:00
|
|
|
{
|
2020-04-30 03:26:57 +08:00
|
|
|
struct tep_format_field *field = evsel__field(evsel, name);
|
2012-09-12 06:24:23 +08:00
|
|
|
int offset;
|
|
|
|
|
2012-09-18 22:21:50 +08:00
|
|
|
if (!field)
|
|
|
|
return NULL;
|
2012-09-12 06:24:23 +08:00
|
|
|
|
|
|
|
offset = field->offset;
|
|
|
|
|
2018-09-20 02:56:46 +08:00
|
|
|
if (field->flags & TEP_FIELD_IS_DYNAMIC) {
|
2012-09-12 06:24:23 +08:00
|
|
|
offset = *(int *)(sample->raw_data + field->offset);
|
|
|
|
offset &= 0xffff;
|
|
|
|
}
|
|
|
|
|
|
|
|
return sample->raw_data + offset;
|
|
|
|
}
|
|
|
|
|
2018-09-20 02:56:45 +08:00
|
|
|
u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sample,
|
2016-05-31 23:47:46 +08:00
|
|
|
bool needs_swap)
|
2012-09-12 06:24:23 +08:00
|
|
|
{
|
2012-09-27 00:13:04 +08:00
|
|
|
u64 value;
|
2016-05-31 23:47:46 +08:00
|
|
|
void *ptr = sample->raw_data + field->offset;
|
2012-09-12 06:24:23 +08:00
|
|
|
|
2012-09-27 00:13:04 +08:00
|
|
|
switch (field->size) {
|
|
|
|
case 1:
|
|
|
|
return *(u8 *)ptr;
|
|
|
|
case 2:
|
|
|
|
value = *(u16 *)ptr;
|
|
|
|
break;
|
|
|
|
case 4:
|
|
|
|
value = *(u32 *)ptr;
|
|
|
|
break;
|
|
|
|
case 8:
|
perf timechart: Fix SIBGUS error on sparc64
perf timechart -T on sparc64 is terminating due to SIGBUS. Backtrace:
Program received signal SIGBUS, Bus error.
0x0000000000173d7c in perf_evsel__intval (evsel=<value optimized out>, sample=0x7feffffda28, name=0x289b28 "prev_state")
at util/evsel.c:1918
1918 util/evsel.c: No such file or directory.
in util/evsel.c
Missing separate debuginfos, use: debuginfo-install audit-libs-2.3.7-1.0.1.el6.sparc64 bzip2-libs-1.0.5-7.el6_0.sparc64 elfutils-libelf-0.155-2.0.3.el6.sparc64 elfutils-libs-0.155-2.0.3.el6.sparc64 glibc-2.12-1.132.0.8.el6_5.sparc64 numactl-2.0.7-8.el6.sparc64 python-libs-2.6.6-52.0.2.el6.sparc64 slang-2.2.1-1.el6.sparc64 xz-libs-4.999.9-0.3.beta.20091007git.el6.sparc64 zlib-1.2.3-29.el6.sparc64
(gdb) bt
0 0x0000000000173d7c in perf_evsel__intval (evsel=<value optimized out>, sample=0x7feffffda28,
name=0x289b28 "prev_state") at util/evsel.c:1918
1 0x0000000000123b94 in process_sample_sched_switch (tchart=0x7feffffe040, evsel=0x4ca850, sample=0x7feffffda28,
backtrace=0xc39010 "") at builtin-timechart.c:627
2 0x0000000000122828 in process_sample_event (tool=0x7feffffe040, event=<value optimized out>, sample=0x7feffffda28,
evsel=0x4ca850, machine=0x4c9c88) at builtin-timechart.c:569
Another extended load on unaligned pointer. As before fix by copying to
a temporary variable using memcpy.
Signed-off-by: David Ahern <david.ahern@oracle.com>
Link: http://lkml.kernel.org/r/1427228049-51893-1-git-send-email-david.ahern@oracle.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-03-25 04:14:09 +08:00
|
|
|
memcpy(&value, ptr, sizeof(u64));
|
2012-09-27 00:13:04 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-31 23:47:46 +08:00
|
|
|
if (!needs_swap)
|
2012-09-27 00:13:04 +08:00
|
|
|
return value;
|
|
|
|
|
|
|
|
switch (field->size) {
|
|
|
|
case 2:
|
|
|
|
return bswap_16(value);
|
|
|
|
case 4:
|
|
|
|
return bswap_32(value);
|
|
|
|
case 8:
|
|
|
|
return bswap_64(value);
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2012-09-12 06:24:23 +08:00
|
|
|
}
|
2012-12-11 05:17:08 +08:00
|
|
|
|
2020-04-30 03:26:57 +08:00
|
|
|
u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *name)
|
2016-05-31 23:47:46 +08:00
|
|
|
{
|
2020-04-30 03:26:57 +08:00
|
|
|
struct tep_format_field *field = evsel__field(evsel, name);
|
2016-05-31 23:47:46 +08:00
|
|
|
|
|
|
|
if (!field)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return field ? format_field__intval(field, sample, evsel->needs_swap) : 0;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
bool perf_evsel__fallback(struct evsel *evsel, int err,
|
2012-12-14 01:16:30 +08:00
|
|
|
char *msg, size_t msgsize)
|
|
|
|
{
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
int paranoid;
|
|
|
|
|
2013-07-19 07:27:59 +08:00
|
|
|
if ((err == ENOENT || err == ENXIO || err == ENODEV) &&
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.type == PERF_TYPE_HARDWARE &&
|
|
|
|
evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES) {
|
2012-12-14 01:16:30 +08:00
|
|
|
/*
|
|
|
|
* If it's cycles then fall back to hrtimer based
|
|
|
|
* cpu-clock-tick sw counter, which is always available even if
|
|
|
|
* no PMU support.
|
|
|
|
*
|
|
|
|
* PPC returns ENXIO until 2.6.37 (behavior changed with commit
|
|
|
|
* b0a873e).
|
|
|
|
*/
|
|
|
|
scnprintf(msg, msgsize, "%s",
|
|
|
|
"The cycles event is not supported, trying to fall back to cpu-clock-ticks");
|
|
|
|
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.type = PERF_TYPE_SOFTWARE;
|
|
|
|
evsel->core.attr.config = PERF_COUNT_SW_CPU_CLOCK;
|
2012-12-14 01:16:30 +08:00
|
|
|
|
2013-12-27 04:41:15 +08:00
|
|
|
zfree(&evsel->name);
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
return true;
|
2019-07-21 19:24:29 +08:00
|
|
|
} else if (err == EACCES && !evsel->core.attr.exclude_kernel &&
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
(paranoid = perf_event_paranoid()) > 1) {
|
2020-04-30 03:07:09 +08:00
|
|
|
const char *name = evsel__name(evsel);
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
char *new_name;
|
2018-04-23 17:08:17 +08:00
|
|
|
const char *sep = ":";
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
|
2020-04-15 00:15:50 +08:00
|
|
|
/* If event has exclude user then don't exclude kernel. */
|
|
|
|
if (evsel->core.attr.exclude_user)
|
|
|
|
return false;
|
|
|
|
|
2018-04-23 17:08:17 +08:00
|
|
|
/* Is there already the separator in the name. */
|
|
|
|
if (strchr(name, '/') ||
|
|
|
|
strchr(name, ':'))
|
|
|
|
sep = "";
|
|
|
|
|
|
|
|
if (asprintf(&new_name, "%s%su", name, sep) < 0)
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (evsel->name)
|
|
|
|
free(evsel->name);
|
|
|
|
evsel->name = new_name;
|
perf record: Fix priv level with branch sampling for paranoid=2
Now that the default perf_events paranoid level is set to 2, a regular
user cannot monitor kernel level activity anymore. As such, with the
following cmdline:
$ perf record -e cycles date
The perf tool first tries cycles:uk but then falls back to cycles:u as
can be seen in the perf report --header-only output:
cmdline : /export/hda3/tmp/perf.tip record -e cycles ls
event : name = cycles:u, , id = { 436186, ... }
This is okay as long as there is a way to learn the priv level was changed
internally by the tool.
But consider a similar example:
$ perf record -b -e cycles date
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
...
Why is that treated differently given that the branch sampling inherits the
priv level of the first event in this case, i.e., cycles:u? It turns out
that the branch sampling code is more picky and also checks exclude_hv.
In the fallback path, perf record is setting exclude_kernel = 1, but it
does not change exclude_hv. This does not seem to match the restriction
imposed by paranoid = 2.
This patch fixes the problem by forcing exclude_hv = 1 in the fallback
for paranoid=2. With this in place:
$ perf record -b -e cycles date
cmdline : /export/hda3/tmp/perf.tip record -b -e cycles ls
event : name = cycles:u, , id = { 436847, ... }
And the command succeeds as expected.
V2 fix a white space.
Committer testing:
After applying the patch we get:
[acme@quaco ~]$ perf record -b -e cycles date
WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,
check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.
Samples in kernel functions may not be resolved if a suitable vmlinux
file is not found in the buildid cache or in the vmlinux path.
Samples in kernel modules won't be resolved at all.
If some relocation was applied (e.g. kexec) symbols may be misresolved
even with a suitable vmlinux or kallsyms file.
Mon 23 Sep 2019 11:00:59 AM -03
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.005 MB perf.data (14 samples) ]
[acme@quaco ~]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: ANY
[acme@quaco ~]$
That warning about restricted kernel maps will be suppressed in a follow
up patch, as perf_event_attr.exclude_kernel is set, i.e. no samples for
the kernel will be taken and thus no need for those maps.
Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20190920230356.41420-1-eranian@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-09-21 07:03:56 +08:00
|
|
|
scnprintf(msg, msgsize, "kernel.perf_event_paranoid=%d, trying "
|
|
|
|
"to fall back to excluding kernel and hypervisor "
|
|
|
|
" samples", paranoid);
|
2019-07-21 19:24:29 +08:00
|
|
|
evsel->core.attr.exclude_kernel = 1;
|
perf record: Fix priv level with branch sampling for paranoid=2
Now that the default perf_events paranoid level is set to 2, a regular
user cannot monitor kernel level activity anymore. As such, with the
following cmdline:
$ perf record -e cycles date
The perf tool first tries cycles:uk but then falls back to cycles:u as
can be seen in the perf report --header-only output:
cmdline : /export/hda3/tmp/perf.tip record -e cycles ls
event : name = cycles:u, , id = { 436186, ... }
This is okay as long as there is a way to learn the priv level was changed
internally by the tool.
But consider a similar example:
$ perf record -b -e cycles date
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
...
Why is that treated differently given that the branch sampling inherits the
priv level of the first event in this case, i.e., cycles:u? It turns out
that the branch sampling code is more picky and also checks exclude_hv.
In the fallback path, perf record is setting exclude_kernel = 1, but it
does not change exclude_hv. This does not seem to match the restriction
imposed by paranoid = 2.
This patch fixes the problem by forcing exclude_hv = 1 in the fallback
for paranoid=2. With this in place:
$ perf record -b -e cycles date
cmdline : /export/hda3/tmp/perf.tip record -b -e cycles ls
event : name = cycles:u, , id = { 436847, ... }
And the command succeeds as expected.
V2 fix a white space.
Committer testing:
After applying the patch we get:
[acme@quaco ~]$ perf record -b -e cycles date
WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,
check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.
Samples in kernel functions may not be resolved if a suitable vmlinux
file is not found in the buildid cache or in the vmlinux path.
Samples in kernel modules won't be resolved at all.
If some relocation was applied (e.g. kexec) symbols may be misresolved
even with a suitable vmlinux or kallsyms file.
Mon 23 Sep 2019 11:00:59 AM -03
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.005 MB perf.data (14 samples) ]
[acme@quaco ~]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: ANY
[acme@quaco ~]$
That warning about restricted kernel maps will be suppressed in a follow
up patch, as perf_event_attr.exclude_kernel is set, i.e. no samples for
the kernel will be taken and thus no need for those maps.
Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20190920230356.41420-1-eranian@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-09-21 07:03:56 +08:00
|
|
|
evsel->core.attr.exclude_hv = 1;
|
perf evsel: Handle EACCESS + perf_event_paranoid=2 in fallback()
Now with the default for the kernel.perf_event_paranoid sysctl being 2 [1]
we need to fall back to :u, i.e. to set perf_event_attr.exclude_kernel
to 1.
Before:
[acme@jouet linux]$ perf record usleep 1
Error:
You may not have permission to collect stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
[acme@jouet linux]$
After:
[acme@jouet linux]$ perf record usleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$ perf evlist
cycles:u
[acme@jouet linux]$ perf evlist -v
cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1
[acme@jouet linux]$
And if the user turns on verbose mode, an explanation will appear:
[acme@jouet linux]$ perf record -v usleep 1
Warning:
kernel.perf_event_paranoid=2, trying to fall back to excluding kernel samples
mmap size 528384B
[ perf record: Woken up 1 times to write data ]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/4.6.0-rc7+/build/vmlinux for symbols
[ perf record: Captured and wrote 0.016 MB perf.data (7 samples) ]
[acme@jouet linux]$
[1] 0161028b7c8a ("perf/core: Change the default paranoia level to 2")
Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-b20jmx4dxt5hpaa9t2rroi0o@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-13 03:07:47 +08:00
|
|
|
|
2012-12-14 01:16:30 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
2012-12-14 02:10:58 +08:00
|
|
|
|
2017-06-20 23:05:38 +08:00
|
|
|
static bool find_process(const char *name)
|
|
|
|
{
|
|
|
|
size_t len = strlen(name);
|
|
|
|
DIR *dir;
|
|
|
|
struct dirent *d;
|
|
|
|
int ret = -1;
|
|
|
|
|
|
|
|
dir = opendir(procfs__mountpoint());
|
|
|
|
if (!dir)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* Walk through the directory. */
|
|
|
|
while (ret && (d = readdir(dir)) != NULL) {
|
|
|
|
char path[PATH_MAX];
|
|
|
|
char *data;
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
if ((d->d_type != DT_DIR) ||
|
|
|
|
!strcmp(".", d->d_name) ||
|
|
|
|
!strcmp("..", d->d_name))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
scnprintf(path, sizeof(path), "%s/%s/comm",
|
|
|
|
procfs__mountpoint(), d->d_name);
|
|
|
|
|
|
|
|
if (filename__read_str(path, &data, &size))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ret = strncmp(name, data, len);
|
|
|
|
free(data);
|
|
|
|
}
|
|
|
|
|
|
|
|
closedir(dir);
|
|
|
|
return ret ? false : true;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
int perf_evsel__open_strerror(struct evsel *evsel, struct target *target,
|
2012-12-14 02:10:58 +08:00
|
|
|
int err, char *msg, size_t size)
|
|
|
|
{
|
2014-08-14 10:22:36 +08:00
|
|
|
char sbuf[STRERR_BUFSIZE];
|
perf evsel: Return exact sub event which failed with EPERM for wildcards
The kernel has a special check for a specific irq_vectors trace event.
TRACE_EVENT_PERF_PERM(irq_work_exit,
is_sampling_event(p_event) ? -EPERM : 0);
The perf-record fails for this irq_vectors event when it is present,
like when using a wildcard:
root@skl:/tmp# perf record -a -e irq_vectors:* sleep 2
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
To make this setting permanent, edit /etc/sysctl.conf too, e.g.:
kernel.perf_event_paranoid = -1
This patch prints out the exact sub event that failed with EPERM for
wildcards to help in understanding what went wrong when this event is
present:
After the patch:
root@skl:/tmp# perf record -a -e irq_vectors:* sleep 2
Error:
No permission to enable irq_vectors:irq_work_exit event.
You may not have permission to collect system-wide stats.
......
Committer notes:
So we have a lot of irq_vectors events:
[root@jouet ~]# perf list irq_vectors:*
List of pre-defined events (to be used in -e):
irq_vectors:call_function_entry [Tracepoint event]
irq_vectors:call_function_exit [Tracepoint event]
irq_vectors:call_function_single_entry [Tracepoint event]
irq_vectors:call_function_single_exit [Tracepoint event]
irq_vectors:deferred_error_apic_entry [Tracepoint event]
irq_vectors:deferred_error_apic_exit [Tracepoint event]
irq_vectors:error_apic_entry [Tracepoint event]
irq_vectors:error_apic_exit [Tracepoint event]
irq_vectors:irq_work_entry [Tracepoint event]
irq_vectors:irq_work_exit [Tracepoint event]
irq_vectors:local_timer_entry [Tracepoint event]
irq_vectors:local_timer_exit [Tracepoint event]
irq_vectors:reschedule_entry [Tracepoint event]
irq_vectors:reschedule_exit [Tracepoint event]
irq_vectors:spurious_apic_entry [Tracepoint event]
irq_vectors:spurious_apic_exit [Tracepoint event]
irq_vectors:thermal_apic_entry [Tracepoint event]
irq_vectors:thermal_apic_exit [Tracepoint event]
irq_vectors:threshold_apic_entry [Tracepoint event]
irq_vectors:threshold_apic_exit [Tracepoint event]
irq_vectors:x86_platform_ipi_entry [Tracepoint event]
irq_vectors:x86_platform_ipi_exit [Tracepoint event]
#
And some may be sampled:
[root@jouet ~]# perf record -e irq_vectors:local* sleep 20s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.020 MB perf.data (2 samples) ]
[root@jouet ~]# perf report -D | egrep 'stats:|events:'
Aggregated stats:
TOTAL events: 155
MMAP events: 144
COMM events: 2
EXIT events: 1
SAMPLE events: 2
MMAP2 events: 4
FINISHED_ROUND events: 1
TIME_CONV events: 1
irq_vectors:local_timer_entry stats:
TOTAL events: 1
SAMPLE events: 1
irq_vectors:local_timer_exit stats:
TOTAL events: 1
SAMPLE events: 1
[root@jouet ~]#
But, as shown in the tracepoint definition at the start of this message,
some, like "irq_vectors:irq_work_exit", may not be sampled, just counted,
i.e. if we try to sample, as when using 'perf record', we get an error:
[root@jouet ~]# perf record -e irq_vectors:irq_work_exit
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
<SNIP>
The error message is misleading, this patch will help in pointing out
what is the event causing such an error, but the error message needs
improvement, i.e. we need to figure out a way to check if a tracepoint
is counting only, like this one, when all we can do is to count it with
'perf stat', at most printing the delta using interval printing, as in:
[root@jouet ~]# perf stat -I 5000 -e irq_vectors:irq_work_*
# time counts unit events
5.000168871 0 irq_vectors:irq_work_entry
5.000168871 0 irq_vectors:irq_work_exit
10.000676730 0 irq_vectors:irq_work_entry
10.000676730 0 irq_vectors:irq_work_exit
15.001122415 0 irq_vectors:irq_work_entry
15.001122415 0 irq_vectors:irq_work_exit
20.001298051 0 irq_vectors:irq_work_entry
20.001298051 0 irq_vectors:irq_work_exit
25.001485020 1 irq_vectors:irq_work_entry
25.001485020 1 irq_vectors:irq_work_exit
30.001658706 0 irq_vectors:irq_work_entry
30.001658706 0 irq_vectors:irq_work_exit
^C 32.045711878 0 irq_vectors:irq_work_entry
32.045711878 0 irq_vectors:irq_work_exit
[root@jouet ~]#
But at least, when we use a wildcard, this patch helps a bit.
Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1491566932-503-1-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-04-07 20:08:52 +08:00
|
|
|
int printed = 0;
|
2014-08-14 10:22:36 +08:00
|
|
|
|
2012-12-14 02:10:58 +08:00
|
|
|
switch (err) {
|
|
|
|
case EPERM:
|
|
|
|
case EACCES:
|
perf evsel: Return exact sub event which failed with EPERM for wildcards
The kernel has a special check for a specific irq_vectors trace event.
TRACE_EVENT_PERF_PERM(irq_work_exit,
is_sampling_event(p_event) ? -EPERM : 0);
The perf-record fails for this irq_vectors event when it is present,
like when using a wildcard:
root@skl:/tmp# perf record -a -e irq_vectors:* sleep 2
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
To make this setting permanent, edit /etc/sysctl.conf too, e.g.:
kernel.perf_event_paranoid = -1
This patch prints out the exact sub event that failed with EPERM for
wildcards to help in understanding what went wrong when this event is
present:
After the patch:
root@skl:/tmp# perf record -a -e irq_vectors:* sleep 2
Error:
No permission to enable irq_vectors:irq_work_exit event.
You may not have permission to collect system-wide stats.
......
Committer notes:
So we have a lot of irq_vectors events:
[root@jouet ~]# perf list irq_vectors:*
List of pre-defined events (to be used in -e):
irq_vectors:call_function_entry [Tracepoint event]
irq_vectors:call_function_exit [Tracepoint event]
irq_vectors:call_function_single_entry [Tracepoint event]
irq_vectors:call_function_single_exit [Tracepoint event]
irq_vectors:deferred_error_apic_entry [Tracepoint event]
irq_vectors:deferred_error_apic_exit [Tracepoint event]
irq_vectors:error_apic_entry [Tracepoint event]
irq_vectors:error_apic_exit [Tracepoint event]
irq_vectors:irq_work_entry [Tracepoint event]
irq_vectors:irq_work_exit [Tracepoint event]
irq_vectors:local_timer_entry [Tracepoint event]
irq_vectors:local_timer_exit [Tracepoint event]
irq_vectors:reschedule_entry [Tracepoint event]
irq_vectors:reschedule_exit [Tracepoint event]
irq_vectors:spurious_apic_entry [Tracepoint event]
irq_vectors:spurious_apic_exit [Tracepoint event]
irq_vectors:thermal_apic_entry [Tracepoint event]
irq_vectors:thermal_apic_exit [Tracepoint event]
irq_vectors:threshold_apic_entry [Tracepoint event]
irq_vectors:threshold_apic_exit [Tracepoint event]
irq_vectors:x86_platform_ipi_entry [Tracepoint event]
irq_vectors:x86_platform_ipi_exit [Tracepoint event]
#
And some may be sampled:
[root@jouet ~]# perf record -e irq_vectors:local* sleep 20s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.020 MB perf.data (2 samples) ]
[root@jouet ~]# perf report -D | egrep 'stats:|events:'
Aggregated stats:
TOTAL events: 155
MMAP events: 144
COMM events: 2
EXIT events: 1
SAMPLE events: 2
MMAP2 events: 4
FINISHED_ROUND events: 1
TIME_CONV events: 1
irq_vectors:local_timer_entry stats:
TOTAL events: 1
SAMPLE events: 1
irq_vectors:local_timer_exit stats:
TOTAL events: 1
SAMPLE events: 1
[root@jouet ~]#
But, as shown in the tracepoint definition at the start of this message,
some, like "irq_vectors:irq_work_exit", may not be sampled, just counted,
i.e. if we try to sample, as when using 'perf record', we get an error:
[root@jouet ~]# perf record -e irq_vectors:irq_work_exit
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
<SNIP>
The error message is misleading, this patch will help in pointing out
what is the event causing such an error, but the error message needs
improvement, i.e. we need to figure out a way to check if a tracepoint
is counting only, like this one, when all we can do is to count it with
'perf stat', at most printing the delta using interval printing, as in:
[root@jouet ~]# perf stat -I 5000 -e irq_vectors:irq_work_*
# time counts unit events
5.000168871 0 irq_vectors:irq_work_entry
5.000168871 0 irq_vectors:irq_work_exit
10.000676730 0 irq_vectors:irq_work_entry
10.000676730 0 irq_vectors:irq_work_exit
15.001122415 0 irq_vectors:irq_work_entry
15.001122415 0 irq_vectors:irq_work_exit
20.001298051 0 irq_vectors:irq_work_entry
20.001298051 0 irq_vectors:irq_work_exit
25.001485020 1 irq_vectors:irq_work_entry
25.001485020 1 irq_vectors:irq_work_exit
30.001658706 0 irq_vectors:irq_work_entry
30.001658706 0 irq_vectors:irq_work_exit
^C 32.045711878 0 irq_vectors:irq_work_entry
32.045711878 0 irq_vectors:irq_work_exit
[root@jouet ~]#
But at least, when we use a wildcard, this patch helps a bit.
Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1491566932-503-1-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-04-07 20:08:52 +08:00
|
|
|
if (err == EPERM)
|
|
|
|
printed = scnprintf(msg, size,
|
2020-04-30 03:07:09 +08:00
|
|
|
"No permission to enable %s event.\n\n", evsel__name(evsel));
|
perf evsel: Return exact sub event which failed with EPERM for wildcards
The kernel has a special check for a specific irq_vectors trace event.
TRACE_EVENT_PERF_PERM(irq_work_exit,
is_sampling_event(p_event) ? -EPERM : 0);
The perf-record fails for this irq_vectors event when it is present,
like when using a wildcard:
root@skl:/tmp# perf record -a -e irq_vectors:* sleep 2
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
which controls use of the performance events system by
unprivileged users (without CAP_SYS_ADMIN).
The current value is 2:
-1: Allow use of (almost) all events by all users
>= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>= 1: Disallow CPU event access by users without CAP_SYS_ADMIN
>= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN
To make this setting permanent, edit /etc/sysctl.conf too, e.g.:
kernel.perf_event_paranoid = -1
This patch prints out the exact sub event that failed with EPERM for
wildcards to help in understanding what went wrong when this event is
present:
After the patch:
root@skl:/tmp# perf record -a -e irq_vectors:* sleep 2
Error:
No permission to enable irq_vectors:irq_work_exit event.
You may not have permission to collect system-wide stats.
......
Committer notes:
So we have a lot of irq_vectors events:
[root@jouet ~]# perf list irq_vectors:*
List of pre-defined events (to be used in -e):
irq_vectors:call_function_entry [Tracepoint event]
irq_vectors:call_function_exit [Tracepoint event]
irq_vectors:call_function_single_entry [Tracepoint event]
irq_vectors:call_function_single_exit [Tracepoint event]
irq_vectors:deferred_error_apic_entry [Tracepoint event]
irq_vectors:deferred_error_apic_exit [Tracepoint event]
irq_vectors:error_apic_entry [Tracepoint event]
irq_vectors:error_apic_exit [Tracepoint event]
irq_vectors:irq_work_entry [Tracepoint event]
irq_vectors:irq_work_exit [Tracepoint event]
irq_vectors:local_timer_entry [Tracepoint event]
irq_vectors:local_timer_exit [Tracepoint event]
irq_vectors:reschedule_entry [Tracepoint event]
irq_vectors:reschedule_exit [Tracepoint event]
irq_vectors:spurious_apic_entry [Tracepoint event]
irq_vectors:spurious_apic_exit [Tracepoint event]
irq_vectors:thermal_apic_entry [Tracepoint event]
irq_vectors:thermal_apic_exit [Tracepoint event]
irq_vectors:threshold_apic_entry [Tracepoint event]
irq_vectors:threshold_apic_exit [Tracepoint event]
irq_vectors:x86_platform_ipi_entry [Tracepoint event]
irq_vectors:x86_platform_ipi_exit [Tracepoint event]
#
And some may be sampled:
[root@jouet ~]# perf record -e irq_vectors:local* sleep 20s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.020 MB perf.data (2 samples) ]
[root@jouet ~]# perf report -D | egrep 'stats:|events:'
Aggregated stats:
TOTAL events: 155
MMAP events: 144
COMM events: 2
EXIT events: 1
SAMPLE events: 2
MMAP2 events: 4
FINISHED_ROUND events: 1
TIME_CONV events: 1
irq_vectors:local_timer_entry stats:
TOTAL events: 1
SAMPLE events: 1
irq_vectors:local_timer_exit stats:
TOTAL events: 1
SAMPLE events: 1
[root@jouet ~]#
But, as shown in the tracepoint definition at the start of this message,
some, like "irq_vectors:irq_work_exit", may not be sampled, just counted,
i.e. if we try to sample, as when using 'perf record', we get an error:
[root@jouet ~]# perf record -e irq_vectors:irq_work_exit
Error:
You may not have permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid,
<SNIP>
The error message is misleading, this patch will help in pointing out
what is the event causing such an error, but the error message needs
improvement, i.e. we need to figure out a way to check if a tracepoint
is counting only, like this one, when all we can do is to count it with
'perf stat', at most printing the delta using interval printing, as in:
[root@jouet ~]# perf stat -I 5000 -e irq_vectors:irq_work_*
# time counts unit events
5.000168871 0 irq_vectors:irq_work_entry
5.000168871 0 irq_vectors:irq_work_exit
10.000676730 0 irq_vectors:irq_work_entry
10.000676730 0 irq_vectors:irq_work_exit
15.001122415 0 irq_vectors:irq_work_entry
15.001122415 0 irq_vectors:irq_work_exit
20.001298051 0 irq_vectors:irq_work_entry
20.001298051 0 irq_vectors:irq_work_exit
25.001485020 1 irq_vectors:irq_work_entry
25.001485020 1 irq_vectors:irq_work_exit
30.001658706 0 irq_vectors:irq_work_entry
30.001658706 0 irq_vectors:irq_work_exit
^C 32.045711878 0 irq_vectors:irq_work_entry
32.045711878 0 irq_vectors:irq_work_exit
[root@jouet ~]#
But at least, when we use a wildcard, this patch helps a bit.
Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1491566932-503-1-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-04-07 20:08:52 +08:00
|
|
|
|
|
|
|
return scnprintf(msg + printed, size - printed,
|
2016-01-20 05:35:15 +08:00
|
|
|
"You may not have permission to collect %sstats.\n\n"
|
|
|
|
"Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
|
|
|
|
"which controls use of the performance events system by\n"
|
perf tools: Support CAP_PERFMON capability
Extend error messages to mention CAP_PERFMON capability as an option to
substitute CAP_SYS_ADMIN capability for secure system performance
monitoring and observability operations. Make
perf_event_paranoid_check() and __cmd_ftrace() to be aware of
CAP_PERFMON capability.
CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required)
For backward compatibility reasons access to perf_events subsystem remains
open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for
secure perf_events monitoring is discouraged with respect to CAP_PERFMON
capability.
Committer testing:
Using a libcap with this patch:
diff --git a/libcap/include/uapi/linux/capability.h b/libcap/include/uapi/linux/capability.h
index 78b2fd4c8a95..89b5b0279b60 100644
--- a/libcap/include/uapi/linux/capability.h
+++ b/libcap/include/uapi/linux/capability.h
@@ -366,8 +366,9 @@ struct vfs_ns_cap_data {
#define CAP_AUDIT_READ 37
+#define CAP_PERFMON 38
-#define CAP_LAST_CAP CAP_AUDIT_READ
+#define CAP_LAST_CAP CAP_PERFMON
#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
Note that using '38' in place of 'cap_perfmon' works to some degree with
an old libcap; it's only when cap_get_flag() is called, and libcap
performs an error check based on the maximum value known for
capabilities, that it will fail.
This makes determining the default of perf_event_attr.exclude_kernel
fail, as it can't determine if CAP_PERFMON is in place.
Using 'perf top -e cycles' avoids the default check and sets
perf_event_attr.exclude_kernel to 1.
As root, with a libcap supporting CAP_PERFMON:
# groupadd perf_users
# adduser perf -g perf_users
# mkdir ~perf/bin
# cp ~acme/bin/perf ~perf/bin/
# chgrp perf_users ~perf/bin/perf
# setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" ~perf/bin/perf
# getcap ~perf/bin/perf
/home/perf/bin/perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep
# ls -la ~perf/bin/perf
-rwxr-xr-x. 1 root perf_users 16968552 Apr 9 13:10 /home/perf/bin/perf
As the 'perf' user in the 'perf_users' group:
$ perf top -a --stdio
Error:
Failed to mmap with 1 (Operation not permitted)
$
Either add the cap_ipc_lock capability to the perf binary or reduce the
ring buffer size to some smaller value:
$ perf top -m10 -a --stdio
rounding mmap pages size to 64K (16 pages)
Error:
Failed to mmap with 1 (Operation not permitted)
$ perf top -m4 -a --stdio
Error:
Failed to mmap with 1 (Operation not permitted)
$ perf top -m2 -a --stdio
PerfTop: 762 irqs/sec kernel:49.7% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 4 CPUs)
------------------------------------------------------------------------------------------------------
9.83% perf [.] __symbols__insert
8.58% perf [.] rb_next
5.91% [kernel] [k] module_get_kallsym
5.66% [kernel] [k] kallsyms_expand_symbol.constprop.0
3.98% libc-2.29.so [.] __GI_____strtoull_l_internal
3.66% perf [.] rb_insert_color
2.34% [kernel] [k] vsnprintf
2.30% [kernel] [k] string_nocheck
2.16% libc-2.29.so [.] _IO_getdelim
2.15% [kernel] [k] number
2.13% [kernel] [k] format_decode
1.58% libc-2.29.so [.] _IO_feof
1.52% libc-2.29.so [.] __strcmp_avx2
1.50% perf [.] rb_set_parent_color
1.47% libc-2.29.so [.] __libc_calloc
1.24% [kernel] [k] do_syscall_64
1.17% [kernel] [k] __x86_indirect_thunk_rax
$ perf record -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.552 MB perf.data (74 samples) ]
$ perf evlist
cycles
$ perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
$ perf report | head -20
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 74 of event 'cycles'
# Event count (approx.): 15694834
#
# Overhead Command Shared Object Symbol
# ........ ............... .......................... ......................................
#
19.62% perf [kernel.vmlinux] [k] strnlen_user
13.88% swapper [kernel.vmlinux] [k] intel_idle
13.83% ksoftirqd/0 [kernel.vmlinux] [k] pfifo_fast_dequeue
13.51% swapper [kernel.vmlinux] [k] kmem_cache_free
6.31% gnome-shell [kernel.vmlinux] [k] kmem_cache_free
5.66% kworker/u8:3+ix [kernel.vmlinux] [k] delay_tsc
4.42% perf [kernel.vmlinux] [k] __set_cpus_allowed_ptr
3.45% kworker/2:1-eve [kernel.vmlinux] [k] shmem_truncate_range
2.29% gnome-shell libgobject-2.0.so.0.6000.7 [.] g_closure_ref
$
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Igor Lubashev <ilubashe@akamai.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/a66d5648-2b8e-577e-e1f2-1d56c017ab5e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-04-02 16:47:35 +08:00
|
|
|
"unprivileged users (without CAP_PERFMON or CAP_SYS_ADMIN).\n\n"
|
2016-05-13 02:44:55 +08:00
|
|
|
"The current value is %d:\n\n"
|
2016-01-20 05:35:15 +08:00
|
|
|
" -1: Allow use of (almost) all events by all users\n"
|
2017-08-20 19:39:20 +08:00
|
|
|
" Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n"
|
perf tools: Support CAP_PERFMON capability
Extend error messages to mention CAP_PERFMON capability as an option to
substitute CAP_SYS_ADMIN capability for secure system performance
monitoring and observability operations. Make
perf_event_paranoid_check() and __cmd_ftrace() to be aware of
CAP_PERFMON capability.
CAP_PERFMON implements the principle of least privilege for performance
monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39
principle of least privilege: A security design principle that states
that a process or program be granted only those privileges (e.g.,
capabilities) necessary to accomplish its legitimate function, and only
for the time that such privileges are actually required)
For backward compatibility reasons access to perf_events subsystem remains
open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for
secure perf_events monitoring is discouraged with respect to CAP_PERFMON
capability.
Committer testing:
Using a libcap with this patch:
diff --git a/libcap/include/uapi/linux/capability.h b/libcap/include/uapi/linux/capability.h
index 78b2fd4c8a95..89b5b0279b60 100644
--- a/libcap/include/uapi/linux/capability.h
+++ b/libcap/include/uapi/linux/capability.h
@@ -366,8 +366,9 @@ struct vfs_ns_cap_data {
#define CAP_AUDIT_READ 37
+#define CAP_PERFMON 38
-#define CAP_LAST_CAP CAP_AUDIT_READ
+#define CAP_LAST_CAP CAP_PERFMON
#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
Note that using '38' in place of 'cap_perfmon' works to some degree with
an old libcap; it's only when cap_get_flag() is called, and libcap
performs an error check based on the maximum value known for
capabilities, that it will fail.
This makes determining the default of perf_event_attr.exclude_kernel
fail, as it can't determine if CAP_PERFMON is in place.
Using 'perf top -e cycles' avoids the default check and sets
perf_event_attr.exclude_kernel to 1.
As root, with a libcap supporting CAP_PERFMON:
# groupadd perf_users
# adduser perf -g perf_users
# mkdir ~perf/bin
# cp ~acme/bin/perf ~perf/bin/
# chgrp perf_users ~perf/bin/perf
# setcap "cap_perfmon,cap_sys_ptrace,cap_syslog=ep" ~perf/bin/perf
# getcap ~perf/bin/perf
/home/perf/bin/perf = cap_sys_ptrace,cap_syslog,cap_perfmon+ep
# ls -la ~perf/bin/perf
-rwxr-xr-x. 1 root perf_users 16968552 Apr 9 13:10 /home/perf/bin/perf
As the 'perf' user in the 'perf_users' group:
$ perf top -a --stdio
Error:
Failed to mmap with 1 (Operation not permitted)
$
Either add the cap_ipc_lock capability to the perf binary or reduce the
ring buffer size to some smaller value:
$ perf top -m10 -a --stdio
rounding mmap pages size to 64K (16 pages)
Error:
Failed to mmap with 1 (Operation not permitted)
$ perf top -m4 -a --stdio
Error:
Failed to mmap with 1 (Operation not permitted)
$ perf top -m2 -a --stdio
PerfTop: 762 irqs/sec kernel:49.7% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 4 CPUs)
------------------------------------------------------------------------------------------------------
9.83% perf [.] __symbols__insert
8.58% perf [.] rb_next
5.91% [kernel] [k] module_get_kallsym
5.66% [kernel] [k] kallsyms_expand_symbol.constprop.0
3.98% libc-2.29.so [.] __GI_____strtoull_l_internal
3.66% perf [.] rb_insert_color
2.34% [kernel] [k] vsnprintf
2.30% [kernel] [k] string_nocheck
2.16% libc-2.29.so [.] _IO_getdelim
2.15% [kernel] [k] number
2.13% [kernel] [k] format_decode
1.58% libc-2.29.so [.] _IO_feof
1.52% libc-2.29.so [.] __strcmp_avx2
1.50% perf [.] rb_set_parent_color
1.47% libc-2.29.so [.] __libc_calloc
1.24% [kernel] [k] do_syscall_64
1.17% [kernel] [k] __x86_indirect_thunk_rax
$ perf record -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.552 MB perf.data (74 samples) ]
$ perf evlist
cycles
$ perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
$ perf report | head -20
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 74 of event 'cycles'
# Event count (approx.): 15694834
#
# Overhead Command Shared Object Symbol
# ........ ............... .......................... ......................................
#
19.62% perf [kernel.vmlinux] [k] strnlen_user
13.88% swapper [kernel.vmlinux] [k] intel_idle
13.83% ksoftirqd/0 [kernel.vmlinux] [k] pfifo_fast_dequeue
13.51% swapper [kernel.vmlinux] [k] kmem_cache_free
6.31% gnome-shell [kernel.vmlinux] [k] kmem_cache_free
5.66% kworker/u8:3+ix [kernel.vmlinux] [k] delay_tsc
4.42% perf [kernel.vmlinux] [k] __set_cpus_allowed_ptr
3.45% kworker/2:1-eve [kernel.vmlinux] [k] shmem_truncate_range
2.29% gnome-shell libgobject-2.0.so.0.6000.7 [.] g_closure_ref
$
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Igor Lubashev <ilubashe@akamai.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-doc@vger.kernel.org
Cc: linux-man@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: selinux@vger.kernel.org
Link: http://lore.kernel.org/lkml/a66d5648-2b8e-577e-e1f2-1d56c017ab5e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2020-04-02 16:47:35 +08:00
|
|
|
">= 0: Disallow ftrace function tracepoint by users without CAP_PERFMON or CAP_SYS_ADMIN\n"
|
|
|
|
" Disallow raw tracepoint access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n"
|
|
|
|
">= 1: Disallow CPU event access by users without CAP_PERFMON or CAP_SYS_ADMIN\n"
|
|
|
|
">= 2: Disallow kernel profiling by users without CAP_PERFMON or CAP_SYS_ADMIN\n\n"
|
2017-02-14 03:45:24 +08:00
|
|
|
"To make this setting permanent, edit /etc/sysctl.conf too, e.g.:\n\n"
|
|
|
|
" kernel.perf_event_paranoid = -1\n" ,
|
2016-05-13 02:44:55 +08:00
|
|
|
target->system_wide ? "system-wide " : "",
|
|
|
|
perf_event_paranoid());
|
2012-12-14 02:10:58 +08:00
|
|
|
case ENOENT:
|
2020-04-30 03:07:09 +08:00
|
|
|
return scnprintf(msg, size, "The %s event is not supported.", evsel__name(evsel));
|
2012-12-14 02:10:58 +08:00
|
|
|
case EMFILE:
|
|
|
|
return scnprintf(msg, size, "%s",
|
|
|
|
"Too many events are opened.\n"
|
2015-05-26 04:51:54 +08:00
|
|
|
"Probably the maximum number of open file descriptors has been reached.\n"
|
|
|
|
"Hint: Try again after reducing the number of events.\n"
|
|
|
|
"Hint: Try increasing the limit with 'ulimit -n <limit>'");
|
2016-04-28 04:51:45 +08:00
|
|
|
case ENOMEM:
|
2018-05-29 03:00:29 +08:00
|
|
|
if (evsel__has_callchain(evsel) &&
|
2016-04-28 04:51:45 +08:00
|
|
|
access("/proc/sys/kernel/perf_event_max_stack", F_OK) == 0)
|
|
|
|
return scnprintf(msg, size,
|
|
|
|
"Not enough memory to setup event with callchain.\n"
|
|
|
|
"Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n"
|
2018-05-18 03:31:32 +08:00
|
|
|
"Hint: Current value: %d", sysctl__max_stack());
|
2016-04-28 04:51:45 +08:00
|
|
|
break;
|
2012-12-14 02:10:58 +08:00
|
|
|
case ENODEV:
|
|
|
|
if (target->cpu_list)
|
|
|
|
return scnprintf(msg, size, "%s",
|
2016-04-28 04:56:53 +08:00
|
|
|
"No such device - did you specify an out-of-range profile CPU?");
|
2012-12-14 02:10:58 +08:00
|
|
|
break;
|
|
|
|
case EOPNOTSUPP:
|
2019-07-21 19:24:29 +08:00
|
|
|
if (evsel->core.attr.sample_period != 0)
|
2017-11-15 05:04:52 +08:00
|
|
|
return scnprintf(msg, size,
|
|
|
|
"%s: PMU Hardware doesn't support sampling/overflow-interrupts. Try 'perf stat'",
|
2020-04-30 03:07:09 +08:00
|
|
|
evsel__name(evsel));
|
2019-07-21 19:24:29 +08:00
|
|
|
if (evsel->core.attr.precise_ip)
|
2012-12-14 02:10:58 +08:00
|
|
|
return scnprintf(msg, size, "%s",
|
|
|
|
"\'precise\' request may not be supported. Try removing 'p' modifier.");
|
|
|
|
#if defined(__i386__) || defined(__x86_64__)
|
2019-07-21 19:24:29 +08:00
|
|
|
if (evsel->core.attr.type == PERF_TYPE_HARDWARE)
|
2012-12-14 02:10:58 +08:00
|
|
|
return scnprintf(msg, size, "%s",
|
2018-04-07 04:38:12 +08:00
|
|
|
"No hardware sampling interrupt available.\n");
|
2012-12-14 02:10:58 +08:00
|
|
|
#endif
|
|
|
|
break;
|
2014-08-01 23:46:54 +08:00
|
|
|
case EBUSY:
|
|
|
|
if (find_process("oprofiled"))
|
|
|
|
return scnprintf(msg, size,
|
|
|
|
"The PMU counters are busy/taken by another profiler.\n"
|
|
|
|
"We found oprofile daemon running, please stop it and try again.");
|
|
|
|
break;
|
2015-03-31 06:19:31 +08:00
|
|
|
case EINVAL:
|
2019-07-21 19:24:29 +08:00
|
|
|
if (evsel->core.attr.write_backward && perf_missing_features.write_backward)
|
2016-06-20 18:47:18 +08:00
|
|
|
return scnprintf(msg, size, "Reading from overwrite event is not supported by this kernel.");
|
2015-03-31 06:19:31 +08:00
|
|
|
if (perf_missing_features.clockid)
|
|
|
|
return scnprintf(msg, size, "clockid feature not supported.");
|
|
|
|
if (perf_missing_features.clockid_wrong)
|
|
|
|
return scnprintf(msg, size, "wrong clockid (%d).", clockid);
|
2019-08-13 22:06:38 +08:00
|
|
|
if (perf_missing_features.aux_output)
|
|
|
|
return scnprintf(msg, size, "The 'aux_output' feature is not supported, update the kernel.");
|
2015-03-31 06:19:31 +08:00
|
|
|
break;
|
2012-12-14 02:10:58 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return scnprintf(msg, size,
|
2014-08-14 10:22:36 +08:00
|
|
|
"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
|
2018-04-07 04:38:11 +08:00
|
|
|
"/bin/dmesg | grep -i perf may provide additional information.\n",
|
2020-04-30 03:07:09 +08:00
|
|
|
err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel));
|
2012-12-14 02:10:58 +08:00
|
|
|
}
|
2016-06-30 14:14:19 +08:00
|
|
|
|
2019-07-21 19:23:51 +08:00
|
|
|
struct perf_env *perf_evsel__env(struct evsel *evsel)
|
perf annotate: Check for fused instructions
Macro fusion merges two instructions to a single micro-op. Intel core
platform performs this hardware optimization under limited
circumstances.
For example, CMP + JCC can be "fused" and executed /retired together.
While with sampling this can result in the sample sometimes being on the
JCC and sometimes on the CMP. So for the fused instruction pair, they
could be considered together.
On Nehalem, fused instruction pairs:
cmp/test + jcc.
On other new CPU:
cmp/test/add/sub/and/inc/dec + jcc.
This patch adds an x86-specific function which checks if 2 instructions
are in a "fused" pair. For non-x86 arch, the function is just NULL.
Changelog:
v4: Move the CPU model checking to symbol__disassemble and save the CPU
family/model in arch structure.
It avoids checking every time when jump arrow printed.
v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs
just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC).
v2: Remove the original weak function. Arnaldo points out that doing it
as a weak function that will be overridden by the host arch doesn't
work. So now it's implemented as an arch-specific function.
Committer fix:
Do not access evsel->evlist->env->cpuid, ->env can be null, introduce
perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in
this function call.
The original patch was segfaulting 'perf top' + annotation.
But this essentially disables this fused instructions augmentation in
'perf top', the right thing is to get the cpuid from the running kernel,
left for a later patch tho.
Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
|
|
|
{
|
2017-12-11 23:46:11 +08:00
|
|
|
if (evsel && evsel->evlist)
|
|
|
|
return evsel->evlist->env;
|
2019-09-30 22:48:32 +08:00
|
|
|
return &perf_env;
|
perf annotate: Check for fused instructions
Macro fusion merges two instructions to a single micro-op. Intel core
platform performs this hardware optimization under limited
circumstances.
For example, CMP + JCC can be "fused" and executed /retired together.
While with sampling this can result in the sample sometimes being on the
JCC and sometimes on the CMP. So for the fused instruction pair, they
could be considered together.
On Nehalem, fused instruction pairs:
cmp/test + jcc.
On other new CPU:
cmp/test/add/sub/and/inc/dec + jcc.
This patch adds an x86-specific function which checks if 2 instructions
are in a "fused" pair. For non-x86 arch, the function is just NULL.
Changelog:
v4: Move the CPU model checking to symbol__disassemble and save the CPU
family/model in arch structure.
It avoids checking every time when jump arrow printed.
v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs
just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC).
v2: Remove the original weak function. Arnaldo points out that doing it
as a weak function that will be overridden by the host arch doesn't
work. So now it's implemented as an arch-specific function.
Committer fix:
Do not access evsel->evlist->env->cpuid, ->env can be null, introduce
perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in
this function call.
The original patch was segfaulting 'perf top' + annotation.
But this essentially disables this fused instructions augmentation in
'perf top', the right thing is to get the cpuid from the running kernel,
left for a later patch tho.
Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
|
|
|
}
|
2018-08-30 14:32:16 +08:00
|
|
|
|
2019-07-21 19:23:52 +08:00
|
|
|
static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist)
|
2018-08-30 14:32:16 +08:00
|
|
|
{
|
|
|
|
int cpu, thread;
|
|
|
|
|
2019-07-21 19:24:45 +08:00
|
|
|
for (cpu = 0; cpu < xyarray__max_x(evsel->core.fd); cpu++) {
|
|
|
|
for (thread = 0; thread < xyarray__max_y(evsel->core.fd);
|
2018-08-30 14:32:16 +08:00
|
|
|
thread++) {
|
|
|
|
int fd = FD(evsel, cpu, thread);
|
|
|
|
|
2019-09-03 17:19:56 +08:00
|
|
|
if (perf_evlist__id_add_fd(&evlist->core, &evsel->core,
|
2018-08-30 14:32:16 +08:00
|
|
|
cpu, thread, fd) < 0)
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-07-21 19:23:52 +08:00
|
|
|
int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist)
|
2018-08-30 14:32:16 +08:00
|
|
|
{
|
2019-07-21 19:24:37 +08:00
|
|
|
struct perf_cpu_map *cpus = evsel->core.cpus;
|
2019-07-21 19:24:39 +08:00
|
|
|
struct perf_thread_map *threads = evsel->core.threads;
|
2018-08-30 14:32:16 +08:00
|
|
|
|
2019-09-03 16:34:29 +08:00
|
|
|
if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr))
|
2018-08-30 14:32:16 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
return store_evsel_ids(evsel, evlist);
|
|
|
|
}
|