perf stat: Introduce --per-thread option
Currently the values of all tasks given via the -p option PID arguments get aggregated and printed as single values. Add the --per-thread option to print values per task. $ perf stat -e cycles,instructions --per-thread -p 30190,30242 ^C Performance counter stats for process id '30190,30242': cat-30190 0 cycles yes-30242 3,842,525,421 cycles cat-30190 0 instructions yes-30242 10,370,817,010 instructions 1.143155657 seconds time elapsed Also works under interval mode: $ perf stat -e cycles,instructions --per-thread -p 30190,30242 -I 1000 # time comm-pid counts unit events 1.000073435 cat-30190 89,058 cycles 1.000073435 yes-30242 3,360,786,902 cycles (100.00%) 1.000073435 cat-30190 14,066 instructions 1.000073435 yes-30242 9,069,937,462 instructions 2.000204830 cat-30190 0 cycles 2.000204830 yes-30242 3,351,667,626 cycles 2.000204830 cat-30190 0 instructions 2.000204830 yes-30242 9,045,796,885 instructions ^C 2.771286639 cat-30190 0 cycles 2.771286639 yes-30242 2,593,884,166 cycles 2.771286639 cat-30190 0 instructions 2.771286639 yes-30242 7,001,171,191 instructions It works only with the -t and -p options, otherwise the following error is printed: $ perf stat -e cycles --per-thread -I 1000 ls The --per-thread option is only available when monitoring via -p -t options. -p, --pid <pid> stat events on existing process id -t, --tid <tid> stat events on existing thread id Signed-off-by: Jiri Olsa <jolsa@kernel.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: David Ahern <dsahern@gmail.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Stephane Eranian <eranian@google.com> Link: http://lkml.kernel.org/r/1435310967-14570-23-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
d4f63a4741
commit
32b8af82e3
|
@ -144,6 +144,10 @@ is a useful mode to detect imbalance between physical cores. To enable this mod
|
|||
use --per-core in addition to -a. (system-wide). The output includes the
|
||||
core number and the number of online logical processors on that physical processor.
|
||||
|
||||
--per-thread::
|
||||
Aggregate counts per monitored thread, when monitoring threads (-t option)
|
||||
or processes (-p option).
|
||||
|
||||
-D msecs::
|
||||
--delay msecs::
|
||||
After starting the program, wait msecs before measuring. This is useful to
|
||||
|
|
|
@ -231,6 +231,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread,
|
|||
count = &zero;
|
||||
|
||||
switch (aggr_mode) {
|
||||
case AGGR_THREAD:
|
||||
case AGGR_CORE:
|
||||
case AGGR_SOCKET:
|
||||
case AGGR_NONE:
|
||||
|
@ -602,6 +603,14 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
|
|||
csv_output ? 0 : -4,
|
||||
perf_evsel__cpus(evsel)->map[id], csv_sep);
|
||||
break;
|
||||
case AGGR_THREAD:
|
||||
fprintf(output, "%*s-%*d%s",
|
||||
csv_output ? 0 : 16,
|
||||
thread_map__comm(evsel->threads, id),
|
||||
csv_output ? 0 : -8,
|
||||
thread_map__pid(evsel->threads, id),
|
||||
csv_sep);
|
||||
break;
|
||||
case AGGR_GLOBAL:
|
||||
default:
|
||||
break;
|
||||
|
@ -750,6 +759,40 @@ static void print_aggr(char *prefix)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Print one output line per monitored thread for a single counter.
 *
 * For every thread in counter->threads, the val/ena/run fields of the
 * per-cpu counts are summed over all entries of counter->cpus, the summed
 * value is multiplied by counter->scale, and the result is written to
 * 'output' followed by a newline.
 *
 * @counter: event whose counts are printed
 * @prefix:  optional string written at the start of each line; skipped
 *           when NULL
 */
static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
|
||||
{
|
||||
int nthreads = thread_map__nr(counter->threads);
|
||||
int ncpus = cpu_map__nr(counter->cpus);
|
||||
int cpu, thread;
|
||||
double uval;
|
||||
|
||||
for (thread = 0; thread < nthreads; thread++) {
|
||||
u64 ena = 0, run = 0, val = 0; /* totals for this thread, reset per iteration */
|
||||
|
||||
for (cpu = 0; cpu < ncpus; cpu++) { /* fold all cpu entries into the thread totals */
|
||||
val += perf_counts(counter->counts, cpu, thread)->val;
|
||||
ena += perf_counts(counter->counts, cpu, thread)->ena;
|
||||
run += perf_counts(counter->counts, cpu, thread)->run;
|
||||
}
|
||||
|
||||
if (prefix)
|
||||
fprintf(output, "%s", prefix);
|
||||
|
||||
uval = val * counter->scale; /* apply the counter's scale factor to the summed value */
|
||||
|
||||
/* NOTE(review): the thread index is passed as the first argument; presumably it is the id used to print the comm-pid column -- confirm against nsec_printout/abs_printout */
if (nsec_counter(counter))
|
||||
nsec_printout(thread, 0, counter, uval);
|
||||
else
|
||||
abs_printout(thread, 0, counter, uval);
|
||||
|
||||
if (!csv_output) /* print_noise() is only called when not in CSV output mode */
|
||||
print_noise(counter, 1.0);
|
||||
|
||||
print_running(run, ena); /* passes the summed run and ena totals */
|
||||
fputc('\n', output);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Print out the results of a single counter:
|
||||
* aggregated counts in system-wide mode
|
||||
|
@ -876,6 +919,9 @@ static void print_interval(char *prefix, struct timespec *ts)
|
|||
case AGGR_NONE:
|
||||
fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit");
|
||||
break;
|
||||
case AGGR_THREAD:
|
||||
fprintf(output, "# time comm-pid counts %*s events\n", unit_width, "unit");
|
||||
break;
|
||||
case AGGR_GLOBAL:
|
||||
default:
|
||||
fprintf(output, "# time counts %*s events\n", unit_width, "unit");
|
||||
|
@ -944,6 +990,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
|
|||
case AGGR_SOCKET:
|
||||
print_aggr(prefix);
|
||||
break;
|
||||
case AGGR_THREAD:
|
||||
evlist__for_each(evsel_list, counter)
|
||||
print_aggr_thread(counter, prefix);
|
||||
break;
|
||||
case AGGR_GLOBAL:
|
||||
evlist__for_each(evsel_list, counter)
|
||||
print_counter_aggr(counter, prefix);
|
||||
|
@ -1031,6 +1081,7 @@ static int perf_stat_init_aggr_mode(void)
|
|||
break;
|
||||
case AGGR_NONE:
|
||||
case AGGR_GLOBAL:
|
||||
case AGGR_THREAD:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -1255,6 +1306,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
|
|||
"aggregate counts per processor socket", AGGR_SOCKET),
|
||||
OPT_SET_UINT(0, "per-core", &aggr_mode,
|
||||
"aggregate counts per physical processor core", AGGR_CORE),
|
||||
OPT_SET_UINT(0, "per-thread", &aggr_mode,
|
||||
"aggregate counts per thread", AGGR_THREAD),
|
||||
OPT_UINTEGER('D', "delay", &initial_delay,
|
||||
"ms to wait before starting measurement after program start"),
|
||||
OPT_END()
|
||||
|
@ -1346,8 +1399,19 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
|
|||
run_count = 1;
|
||||
}
|
||||
|
||||
/* no_aggr, cgroup are for system-wide only */
|
||||
if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) &&
|
||||
if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
|
||||
fprintf(stderr, "The --per-thread option is only available "
|
||||
"when monitoring via -p -t options.\n");
|
||||
parse_options_usage(NULL, options, "p", 1);
|
||||
parse_options_usage(NULL, options, "t", 1);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* no_aggr, cgroup are for system-wide only
|
||||
* --per-thread is aggregated per thread, we dont mix it with cpu mode
|
||||
*/
|
||||
if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) &&
|
||||
!target__has_cpu(&target)) {
|
||||
fprintf(stderr, "both cgroup and no-aggregation "
|
||||
"modes only available in system-wide mode\n");
|
||||
|
@ -1375,6 +1439,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
|
|||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize thread_map with comm names,
|
||||
* so we could print it out on output.
|
||||
*/
|
||||
if (aggr_mode == AGGR_THREAD)
|
||||
thread_map__read_comms(evsel_list->threads);
|
||||
|
||||
if (interval && interval < 100) {
|
||||
pr_err("print interval must be >= 100ms\n");
|
||||
parse_options_usage(stat_usage, options, "I", 1);
|
||||
|
|
|
@ -30,6 +30,7 @@ enum aggr_mode {
|
|||
AGGR_GLOBAL,
|
||||
AGGR_SOCKET,
|
||||
AGGR_CORE,
|
||||
AGGR_THREAD,
|
||||
};
|
||||
|
||||
struct perf_counts_values {
|
||||
|
|
Loading…
Reference in New Issue