perf top: Implement multithreading for perf_event__synthesize_threads
The proc files, which are sorted in alphabetical order, are evenly assigned to several synthesize threads to be processed in parallel. For 'perf top', the number of threads is hard-coded to the number of online CPUs. The following patch will introduce an option to set it. For other perf tools, the thread number is 1, because their process functions, e.g. process_synthesized_event, are not ready for multithreading. This patch series only supports multithreaded event synthesis for 'perf top'. For other tools, it can be done separately later. With multithreading applied, the total processing time gets up to a 1.56x speedup on Knights Mill for 'perf top'. For the processing of a specific single event, the processing time could increase because of lock contention, so proc_map_timeout may need to be increased. Otherwise some proc maps will be truncated. Based on my test, increasing proc_map_timeout has a small impact on the total processing time. The total processing time still gets a 1.49x speedup on Knights Mill after increasing proc_map_timeout. The patch itself doesn't increase proc_map_timeout. There is no need to implement multithreading for per-task monitoring, perf_event__synthesize_thread_map, since it doesn't have a performance issue.
Committer testing: # getconf _NPROCESSORS_ONLN 4 # perf trace --no-inherit -e clone -o /tmp/output perf top # tail -4 /tmp/bla 0.124 ( 0.041 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eb3a8f30, parent_tidptr: 0x7fc3eb3a99d0, child_tidptr: 0x7fc3eb3a99d0, tls: 0x7fc3eb3a9700) = 9548 (perf) 0.246 ( 0.023 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eaba7f30, parent_tidptr: 0x7fc3eaba89d0, child_tidptr: 0x7fc3eaba89d0, tls: 0x7fc3eaba8700) = 9549 (perf) 0.286 ( 0.019 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9550 (perf) 246.540 ( 0.047 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9551 (perf) # Signed-off-by: Kan Liang <kan.liang@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andi Kleen <ak@linux.intel.com> Cc: He Kuang <hekuang@huawei.com> Cc: Lukasz Odzioba <lukasz.odzioba@intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/r/1506696477-146932-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
f988e71bc6
commit
340b47f510
|
@ -1441,7 +1441,8 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
|
|||
perf_session__set_id_hdr_size(kvm->session);
|
||||
ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true);
|
||||
machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target,
|
||||
kvm->evlist->threads, false, kvm->opts.proc_map_timeout);
|
||||
kvm->evlist->threads, false,
|
||||
kvm->opts.proc_map_timeout, 1);
|
||||
err = kvm_live_open_events(kvm);
|
||||
if (err)
|
||||
goto out;
|
||||
|
|
|
@ -863,7 +863,7 @@ static int record__synthesize(struct record *rec, bool tail)
|
|||
|
||||
err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
|
||||
process_synthesized_event, opts->sample_address,
|
||||
opts->proc_map_timeout);
|
||||
opts->proc_map_timeout, 1);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
|
|
@ -958,8 +958,14 @@ static int __cmd_top(struct perf_top *top)
|
|||
if (perf_session__register_idle_thread(top->session) < 0)
|
||||
goto out_delete;
|
||||
|
||||
perf_set_multithreaded();
|
||||
|
||||
machine__synthesize_threads(&top->session->machines.host, &opts->target,
|
||||
top->evlist->threads, false, opts->proc_map_timeout);
|
||||
top->evlist->threads, false,
|
||||
opts->proc_map_timeout,
|
||||
(unsigned int)sysconf(_SC_NPROCESSORS_ONLN));
|
||||
|
||||
perf_set_singlethreaded();
|
||||
|
||||
if (perf_hpp_list.socket) {
|
||||
ret = perf_env__read_cpu_topology_map(&perf_env);
|
||||
|
|
|
@ -1131,7 +1131,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
|
|||
|
||||
err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
|
||||
evlist->threads, trace__tool_process, false,
|
||||
trace->opts.proc_map_timeout);
|
||||
trace->opts.proc_map_timeout, 1);
|
||||
if (err)
|
||||
symbol__exit();
|
||||
|
||||
|
|
|
@ -131,7 +131,7 @@ static int synth_all(struct machine *machine)
|
|||
{
|
||||
return perf_event__synthesize_threads(NULL,
|
||||
perf_event__process,
|
||||
machine, 0, 500);
|
||||
machine, 0, 500, 1);
|
||||
}
|
||||
|
||||
static int synth_process(struct machine *machine)
|
||||
|
|
|
@ -678,23 +678,21 @@ out:
|
|||
return err;
|
||||
}
|
||||
|
||||
int perf_event__synthesize_threads(struct perf_tool *tool,
|
||||
perf_event__handler_t process,
|
||||
struct machine *machine,
|
||||
bool mmap_data,
|
||||
unsigned int proc_map_timeout)
|
||||
static int __perf_event__synthesize_threads(struct perf_tool *tool,
|
||||
perf_event__handler_t process,
|
||||
struct machine *machine,
|
||||
bool mmap_data,
|
||||
unsigned int proc_map_timeout,
|
||||
struct dirent **dirent,
|
||||
int start,
|
||||
int num)
|
||||
{
|
||||
union perf_event *comm_event, *mmap_event, *fork_event;
|
||||
union perf_event *namespaces_event;
|
||||
char proc_path[PATH_MAX];
|
||||
struct dirent **dirent;
|
||||
int err = -1;
|
||||
char *end;
|
||||
pid_t pid;
|
||||
int n, i;
|
||||
|
||||
if (machine__is_default_guest(machine))
|
||||
return 0;
|
||||
int i;
|
||||
|
||||
comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
|
||||
if (comm_event == NULL)
|
||||
|
@ -714,34 +712,25 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
|
|||
if (namespaces_event == NULL)
|
||||
goto out_free_fork;
|
||||
|
||||
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
|
||||
n = scandir(proc_path, &dirent, 0, alphasort);
|
||||
|
||||
if (n < 0)
|
||||
goto out_free_namespaces;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
for (i = start; i < start + num; i++) {
|
||||
if (!isdigit(dirent[i]->d_name[0]))
|
||||
continue;
|
||||
|
||||
pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
|
||||
/* only interested in proper numerical dirents */
|
||||
if (!*end) {
|
||||
/*
|
||||
* We may race with exiting thread, so don't stop just because
|
||||
* one thread couldn't be synthesized.
|
||||
*/
|
||||
__event__synthesize_thread(comm_event, mmap_event, fork_event,
|
||||
namespaces_event, pid, 1, process,
|
||||
tool, machine, mmap_data,
|
||||
proc_map_timeout);
|
||||
}
|
||||
free(dirent[i]);
|
||||
if (*end)
|
||||
continue;
|
||||
/*
|
||||
* We may race with exiting thread, so don't stop just because
|
||||
* one thread couldn't be synthesized.
|
||||
*/
|
||||
__event__synthesize_thread(comm_event, mmap_event, fork_event,
|
||||
namespaces_event, pid, 1, process,
|
||||
tool, machine, mmap_data,
|
||||
proc_map_timeout);
|
||||
}
|
||||
free(dirent);
|
||||
err = 0;
|
||||
|
||||
out_free_namespaces:
|
||||
free(namespaces_event);
|
||||
out_free_fork:
|
||||
free(fork_event);
|
||||
|
@ -753,6 +742,115 @@ out:
|
|||
return err;
|
||||
}
|
||||
|
||||
struct synthesize_threads_arg {
|
||||
struct perf_tool *tool;
|
||||
perf_event__handler_t process;
|
||||
struct machine *machine;
|
||||
bool mmap_data;
|
||||
unsigned int proc_map_timeout;
|
||||
struct dirent **dirent;
|
||||
int num;
|
||||
int start;
|
||||
};
|
||||
|
||||
static void *synthesize_threads_worker(void *arg)
|
||||
{
|
||||
struct synthesize_threads_arg *args = arg;
|
||||
|
||||
__perf_event__synthesize_threads(args->tool, args->process,
|
||||
args->machine, args->mmap_data,
|
||||
args->proc_map_timeout, args->dirent,
|
||||
args->start, args->num);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int perf_event__synthesize_threads(struct perf_tool *tool,
|
||||
perf_event__handler_t process,
|
||||
struct machine *machine,
|
||||
bool mmap_data,
|
||||
unsigned int proc_map_timeout,
|
||||
unsigned int nr_threads_synthesize)
|
||||
{
|
||||
struct synthesize_threads_arg *args = NULL;
|
||||
pthread_t *synthesize_threads = NULL;
|
||||
char proc_path[PATH_MAX];
|
||||
struct dirent **dirent;
|
||||
int num_per_thread;
|
||||
int m, n, i, j;
|
||||
int thread_nr;
|
||||
int base = 0;
|
||||
int err = -1;
|
||||
|
||||
|
||||
if (machine__is_default_guest(machine))
|
||||
return 0;
|
||||
|
||||
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
|
||||
n = scandir(proc_path, &dirent, 0, alphasort);
|
||||
if (n < 0)
|
||||
return err;
|
||||
|
||||
thread_nr = nr_threads_synthesize;
|
||||
|
||||
if (thread_nr <= 1) {
|
||||
err = __perf_event__synthesize_threads(tool, process,
|
||||
machine, mmap_data,
|
||||
proc_map_timeout,
|
||||
dirent, base, n);
|
||||
goto free_dirent;
|
||||
}
|
||||
if (thread_nr > n)
|
||||
thread_nr = n;
|
||||
|
||||
synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
|
||||
if (synthesize_threads == NULL)
|
||||
goto free_dirent;
|
||||
|
||||
args = calloc(sizeof(*args), thread_nr);
|
||||
if (args == NULL)
|
||||
goto free_threads;
|
||||
|
||||
num_per_thread = n / thread_nr;
|
||||
m = n % thread_nr;
|
||||
for (i = 0; i < thread_nr; i++) {
|
||||
args[i].tool = tool;
|
||||
args[i].process = process;
|
||||
args[i].machine = machine;
|
||||
args[i].mmap_data = mmap_data;
|
||||
args[i].proc_map_timeout = proc_map_timeout;
|
||||
args[i].dirent = dirent;
|
||||
}
|
||||
for (i = 0; i < m; i++) {
|
||||
args[i].num = num_per_thread + 1;
|
||||
args[i].start = i * args[i].num;
|
||||
}
|
||||
if (i != 0)
|
||||
base = args[i-1].start + args[i-1].num;
|
||||
for (j = i; j < thread_nr; j++) {
|
||||
args[j].num = num_per_thread;
|
||||
args[j].start = base + (j - i) * args[i].num;
|
||||
}
|
||||
|
||||
for (i = 0; i < thread_nr; i++) {
|
||||
if (pthread_create(&synthesize_threads[i], NULL,
|
||||
synthesize_threads_worker, &args[i]))
|
||||
goto out_join;
|
||||
}
|
||||
err = 0;
|
||||
out_join:
|
||||
for (i = 0; i < thread_nr; i++)
|
||||
pthread_join(synthesize_threads[i], NULL);
|
||||
free(args);
|
||||
free_threads:
|
||||
free(synthesize_threads);
|
||||
free_dirent:
|
||||
for (i = 0; i < n; i++)
|
||||
free(dirent[i]);
|
||||
free(dirent);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
struct process_symbol_args {
|
||||
const char *name;
|
||||
u64 start;
|
||||
|
|
|
@ -680,7 +680,8 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool,
|
|||
int perf_event__synthesize_threads(struct perf_tool *tool,
|
||||
perf_event__handler_t process,
|
||||
struct machine *machine, bool mmap_data,
|
||||
unsigned int proc_map_timeout);
|
||||
unsigned int proc_map_timeout,
|
||||
unsigned int nr_threads_synthesize);
|
||||
int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
|
||||
perf_event__handler_t process,
|
||||
struct machine *machine);
|
||||
|
|
|
@ -2218,12 +2218,16 @@ int machines__for_each_thread(struct machines *machines,
|
|||
int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
|
||||
struct target *target, struct thread_map *threads,
|
||||
perf_event__handler_t process, bool data_mmap,
|
||||
unsigned int proc_map_timeout)
|
||||
unsigned int proc_map_timeout,
|
||||
unsigned int nr_threads_synthesize)
|
||||
{
|
||||
if (target__has_task(target))
|
||||
return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap, proc_map_timeout);
|
||||
else if (target__has_cpu(target))
|
||||
return perf_event__synthesize_threads(tool, process, machine, data_mmap, proc_map_timeout);
|
||||
return perf_event__synthesize_threads(tool, process,
|
||||
machine, data_mmap,
|
||||
proc_map_timeout,
|
||||
nr_threads_synthesize);
|
||||
/* command specified */
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -257,15 +257,18 @@ int machines__for_each_thread(struct machines *machines,
|
|||
int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
|
||||
struct target *target, struct thread_map *threads,
|
||||
perf_event__handler_t process, bool data_mmap,
|
||||
unsigned int proc_map_timeout);
|
||||
unsigned int proc_map_timeout,
|
||||
unsigned int nr_threads_synthesize);
|
||||
static inline
|
||||
int machine__synthesize_threads(struct machine *machine, struct target *target,
|
||||
struct thread_map *threads, bool data_mmap,
|
||||
unsigned int proc_map_timeout)
|
||||
unsigned int proc_map_timeout,
|
||||
unsigned int nr_threads_synthesize)
|
||||
{
|
||||
return __machine__synthesize_threads(machine, NULL, target, threads,
|
||||
perf_event__process, data_mmap,
|
||||
proc_map_timeout);
|
||||
proc_map_timeout,
|
||||
nr_threads_synthesize);
|
||||
}
|
||||
|
||||
pid_t machine__get_current_tid(struct machine *machine, int cpu);
|
||||
|
|
Loading…
Reference in New Issue