perf/core improvements and fixes:

perf annotate: Wei Li: - Fix getting source line failure perf script: Andi Kleen: - Handle missing fields with -F +... perf data: Jiri Olsa: - Prep work to support per-cpu files in a directory. Intel PT: Adrian Hunter: - Improve thread_stack__no_call_return() - Hide x86 retpolines in thread stacks. - exported SQL viewer refactorings, new 'top calls' report.. Alexander Shishkin: - Copy parent's address filter offsets on clone - Fix address filters for vmas with non-zero offset. Applies to ARM's CoreSight as well. python scripts: Tony Jones: - Python3 support for several 'perf script' python scripts. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> -----BEGIN PGP SIGNATURE----- iHUEABYIAB0WIQR2GiIUctdOfX2qHhGyPKLppCJ+JwUCXHRYNwAKCRCyPKLppCJ+ J8XmAQDKY7gb3GhkX+4aE8cGffFYB2YV5mD9Bbu4AM9tuFFBJwD+KAq87FMCy7m7 h7xyWk3UILpz6y235AVdfOmgcNDkpAQ= =SJCG -----END PGP SIGNATURE----- Merge tag 'perf-core-for-mingo-5.1-20190225' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo: perf annotate: Wei Li: - Fix getting source line failure perf script: Andi Kleen: - Handle missing fields with -F +... perf data: Jiri Olsa: - Prep work to support per-cpu files in a directory. Intel PT: Adrian Hunter: - Improve thread_stack__no_call_return() - Hide x86 retpolines in thread stacks. - exported SQL viewer refactorings, new 'top calls' report.. Alexander Shishkin: - Copy parent's address filter offsets on clone - Fix address filters for vmas with non-zero offset. Applies to ARM's CoreSight as well. python scripts: Tony Jones: - Python3 support for several 'perf script' python scripts. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-02-28 08:29:50 +01:00 · 2019-02-28 08:29:50 +01:00 · c978b9460f
parent 0a1571243d de667cce7f
commit c978b9460f
41 changed files with 1019 additions and 429 deletions
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@ -1223,7 +1223,8 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
 static void pt_event_addr_filters_sync(struct perf_event *event)
 {
 	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
-	unsigned long msr_a, msr_b, *offs = event->addr_filters_offs;
+	unsigned long msr_a, msr_b;
+	struct perf_addr_filter_range *fr = event->addr_filter_ranges;
 	struct pt_filters *filters = event->hw.addr_filters;
 	struct perf_addr_filter *filter;
 	int range = 0;
@ -1232,12 +1233,12 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
 		return;

 	list_for_each_entry(filter, &head->list, entry) {
-		if (filter->path.dentry && !offs[range]) {
+		if (filter->path.dentry && !fr[range].start) {
 			msr_a = msr_b = 0;
 		} else {
 			/* apply the offset */
-			msr_a = filter->offset + offs[range];
-			msr_b = filter->size + msr_a - 1;
+			msr_a = fr[range].start;
+			msr_b = msr_a + fr[range].size - 1;
 		}

 		filters->filter[range].msr_a  = msr_a;
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@ -433,15 +433,16 @@ static int etm_addr_filters_validate(struct list_head *filters)
 static void etm_addr_filters_sync(struct perf_event *event)
 {
 	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
-	unsigned long start, stop, *offs = event->addr_filters_offs;
+	unsigned long start, stop;
+	struct perf_addr_filter_range *fr = event->addr_filter_ranges;
 	struct etm_filters *filters = event->hw.addr_filters;
 	struct etm_filter *etm_filter;
 	struct perf_addr_filter *filter;
 	int i = 0;

 	list_for_each_entry(filter, &head->list, entry) {
-		start = filter->offset + offs[i];
-		stop = start + filter->size;
+		start = fr[i].start;
+		stop = start + fr[i].size;
 		etm_filter = &filters->etm_filter[i];

 		switch (filter->action) {
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@ -495,6 +495,11 @@ struct perf_addr_filters_head {
 	unsigned int		nr_file_filters;
 };

+struct perf_addr_filter_range {
+	unsigned long		start;
+	unsigned long		size;
+};
+
 /**
 * enum perf_event_state - the states of an event:
 */
@ -671,7 +676,7 @@ struct perf_event {
 	/* address range filters */
 	struct perf_addr_filters_head	addr_filters;
 	/* vma address array for file-based filders */
-	unsigned long			*addr_filters_offs;
+	struct perf_addr_filter_range	*addr_filter_ranges;
 	unsigned long			addr_filters_gen;

 	void (*destroy)(struct perf_event *);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@ -1255,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx)
 *	      perf_event_context::lock
 *	    perf_event::mmap_mutex
 *	    mmap_sem
+ *	      perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
 *      pmus_lock
@ -2798,7 +2799,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
- *	event::addr_filters_offs array and bump the event::addr_filters_gen;
+ *	event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
@ -4445,7 +4446,7 @@ static void _free_event(struct perf_event *event)

 	perf_event_free_bpf_prog(event);
 	perf_addr_filters_splice(event, NULL);
-	kfree(event->addr_filters_offs);
+	kfree(event->addr_filter_ranges);

 	if (event->destroy)
 		event->destroy(event);
@ -6694,7 +6695,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 	raw_spin_lock_irqsave(&ifh->lock, flags);
 	list_for_each_entry(filter, &ifh->list, entry) {
 		if (filter->path.dentry) {
-			event->addr_filters_offs[count] = 0;
+			event->addr_filter_ranges[count].start = 0;
+			event->addr_filter_ranges[count].size = 0;
 			restart++;
 		}

@ -7374,28 +7376,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
 	return true;
 }

+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+					struct vm_area_struct *vma,
+					struct perf_addr_filter_range *fr)
+{
+	unsigned long vma_size = vma->vm_end - vma->vm_start;
+	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+	struct file *file = vma->vm_file;
+
+	if (!perf_addr_filter_match(filter, file, off, vma_size))
+		return false;
+
+	if (filter->offset < off) {
+		fr->start = vma->vm_start;
+		fr->size = min(vma_size, filter->size - (off - filter->offset));
+	} else {
+		fr->start = vma->vm_start + filter->offset - off;
+		fr->size = min(vma->vm_end - fr->start, filter->size);
+	}
+
+	return true;
+}
+
 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 {
 	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
 	struct vm_area_struct *vma = data;
-	unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
-	struct file *file = vma->vm_file;
 	struct perf_addr_filter *filter;
 	unsigned int restart = 0, count = 0;
+	unsigned long flags;

 	if (!has_addr_filter(event))
 		return;

-	if (!file)
+	if (!vma->vm_file)
 		return;

 	raw_spin_lock_irqsave(&ifh->lock, flags);
 	list_for_each_entry(filter, &ifh->list, entry) {
-		if (perf_addr_filter_match(filter, file, off,
-					     vma->vm_end - vma->vm_start)) {
-			event->addr_filters_offs[count] = vma->vm_start;
+		if (perf_addr_filter_vma_adjust(filter, vma,
+						&event->addr_filter_ranges[count]))
 			restart++;
-		}

 		count++;
 	}
@ -8985,26 +9006,19 @@ static void perf_addr_filters_splice(struct perf_event *event,
 * @filter; if so, adjust filter's address range.
 * Called with mm::mmap_sem down for reading.
 */
-static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
-					    struct mm_struct *mm)
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+				   struct mm_struct *mm,
+				   struct perf_addr_filter_range *fr)
 {
 	struct vm_area_struct *vma;

 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		struct file *file = vma->vm_file;
-		unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
-		unsigned long vma_size = vma->vm_end - vma->vm_start;
-
-		if (!file)
+		if (!vma->vm_file)
 			continue;

-		if (!perf_addr_filter_match(filter, file, off, vma_size))
-			continue;
-
-		return vma->vm_start;
+		if (perf_addr_filter_vma_adjust(filter, vma, fr))
+			return;
 	}
-
-	return 0;
 }

 /*
@ -9038,15 +9052,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event)

 	raw_spin_lock_irqsave(&ifh->lock, flags);
 	list_for_each_entry(filter, &ifh->list, entry) {
-		event->addr_filters_offs[count] = 0;
+		event->addr_filter_ranges[count].start = 0;
+		event->addr_filter_ranges[count].size = 0;

 		/*
 		 * Adjust base offset if the filter is associated to a binary
 		 * that needs to be mapped:
 		 */
 		if (filter->path.dentry)
-			event->addr_filters_offs[count] =
-				perf_addr_filter_apply(filter, mm);
+			perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);

 		count++;
 	}
@ -10320,14 +10334,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		goto err_pmu;

 	if (has_addr_filter(event)) {
-		event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
-						   sizeof(unsigned long),
-						   GFP_KERNEL);
-		if (!event->addr_filters_offs) {
+		event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+						    sizeof(struct perf_addr_filter_range),
+						    GFP_KERNEL);
+		if (!event->addr_filter_ranges) {
 			err = -ENOMEM;
 			goto err_per_task;
 		}

+		/*
+		 * Clone the parent's vma offsets: they are valid until exec()
+		 * even if the mm is not shared with the parent.
+		 */
+		if (event->parent) {
+			struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+			raw_spin_lock_irq(&ifh->lock);
+			memcpy(event->addr_filter_ranges,
+			       event->parent->addr_filter_ranges,
+			       pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+			raw_spin_unlock_irq(&ifh->lock);
+		}
+
 		/* force hw sync on the address filters */
 		event->addr_filters_gen = 1;
 	}
@ -10346,7 +10374,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	return event;

 err_addr_filters:
-	kfree(event->addr_filters_offs);
+	kfree(event->addr_filter_ranges);

 err_per_task:
 	exclusive_event_destroy(event);
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@ -441,7 +441,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
 	}

 	if (total_nr_samples == 0) {
-		ui__error("The %s file has no samples!\n", session->data->file.path);
+		ui__error("The %s data has no samples!\n", session->data->path);
 		goto out;
 	}

@ -578,7 +578,7 @@ int cmd_annotate(int argc, const char **argv)
 	if (quiet)
 		perf_quiet_option();

-	data.file.path = input_name;
+	data.path = input_name;

 	annotate.session = perf_session__new(&data, false, &annotate.tool);
 	if (annotate.session == NULL)
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@ -416,8 +416,8 @@ int cmd_buildid_cache(int argc, const char **argv)
 		nsi = nsinfo__new(ns_id);

 	if (missing_filename) {
-		data.file.path = missing_filename;
-		data.force     = force;
+		data.path  = missing_filename;
+		data.force = force;

 		session = perf_session__new(&data, false, NULL);
 		if (session == NULL)
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@ -52,11 +52,9 @@ static int perf_session__list_build_ids(bool force, bool with_hits)
 {
 	struct perf_session *session;
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = force,
 	};

 	symbol__elf_init();
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@ -2750,8 +2750,8 @@ static int perf_c2c__report(int argc, const char **argv)
 	if (!input_name || !strlen(input_name))
 		input_name = "perf.data";

-	data.file.path = input_name;
-	data.force     = symbol_conf.force;
+	data.path  = input_name;
+	data.force = symbol_conf.force;

 	err = setup_display(display);
 	if (err)
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@ -708,7 +708,7 @@ static void data__fprintf(void)

 	data__for_each_file(i, d)
 		fprintf(stdout, "#  [%d] %s %s\n",
-			d->idx, d->data.file.path,
+			d->idx, d->data.path,
 			!d->idx ? "(Baseline)" : "");

 	fprintf(stdout, "#\n");
@ -779,14 +779,14 @@ static int __cmd_diff(void)
 	data__for_each_file(i, d) {
 		d->session = perf_session__new(&d->data, false, &tool);
 		if (!d->session) {
-			pr_err("Failed to open %s\n", d->data.file.path);
+			pr_err("Failed to open %s\n", d->data.path);
 			ret = -1;
 			goto out_delete;
 		}

 		ret = perf_session__process_events(d->session);
 		if (ret) {
-			pr_err("Failed to process %s\n", d->data.file.path);
+			pr_err("Failed to process %s\n", d->data.path);
 			goto out_delete;
 		}

@ -1289,9 +1289,9 @@ static int data_init(int argc, const char **argv)
 	data__for_each_file(i, d) {
 		struct perf_data *data = &d->data;

-		data->file.path = use_default ? defaults[i] : argv[i];
-		data->mode      = PERF_DATA_MODE_READ,
-		data->force     = force,
+		data->path  = use_default ? defaults[i] : argv[i];
+		data->mode  = PERF_DATA_MODE_READ,
+		data->force = force,

 		d->idx  = i;
 	}
--- a/tools/perf/builtin-evlist.c
+++ b/tools/perf/builtin-evlist.c
@ -23,9 +23,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details
 	struct perf_session *session;
 	struct perf_evsel *pos;
 	struct perf_data data = {
-		.file      = {
-			.path = file_name,
-		},
+		.path      = file_name,
 		.mode      = PERF_DATA_MODE_READ,
 		.force     = details->force,
 	};
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@ -770,10 +770,8 @@ int cmd_inject(int argc, const char **argv)
 		.input_name  = "-",
 		.samples = LIST_HEAD_INIT(inject.samples),
 		.output = {
-			.file      = {
-				.path = "-",
-			},
-			.mode      = PERF_DATA_MODE_WRITE,
+			.path = "-",
+			.mode = PERF_DATA_MODE_WRITE,
 		},
 	};
 	struct perf_data data = {
@ -786,7 +784,7 @@ int cmd_inject(int argc, const char **argv)
 			    "Inject build-ids into the output stream"),
 		OPT_STRING('i', "input", &inject.input_name, "file",
 			   "input file name"),
-		OPT_STRING('o', "output", &inject.output.file.path, "file",
+		OPT_STRING('o', "output", &inject.output.path, "file",
 			   "output file name"),
 		OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat,
 			    "Merge sched-stat and sched-switch for getting events "
@ -834,7 +832,7 @@ int cmd_inject(int argc, const char **argv)

 	inject.tool.ordered_events = inject.sched_stat;

-	data.file.path = inject.input_name;
+	data.path = inject.input_name;
 	inject.session = perf_session__new(&data, true, &inject.tool);
 	if (inject.session == NULL)
 		return -1;
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@ -1949,7 +1949,7 @@ int cmd_kmem(int argc, const char **argv)
 		return __cmd_record(argc, argv);
 	}

-	data.file.path = input_name;
+	data.path = input_name;

 	kmem_session = session = perf_session__new(&data, false, &perf_kmem);
 	if (session == NULL)
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@ -1080,11 +1080,9 @@ static int read_events(struct perf_kvm_stat *kvm)
 		.ordered_events		= true,
 	};
 	struct perf_data file = {
-		.file      = {
-			.path = kvm->file_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = kvm->force,
+		.path  = kvm->file_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = kvm->force,
 	};

 	kvm->tool = eops;
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@ -866,11 +866,9 @@ static int __cmd_report(bool display_info)
 		.ordered_events	 = true,
 	};
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = force,
 	};

 	session = perf_session__new(&data, false, &eops);
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@ -239,11 +239,9 @@ static int process_sample_event(struct perf_tool *tool,
 static int report_raw_events(struct perf_mem *mem)
 {
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = mem->force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = mem->force,
 	};
 	int ret;
 	struct perf_session *session = perf_session__new(&data, false,
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@ -660,10 +660,9 @@ static int process_sample_event(struct perf_tool *tool,

 static int process_buildids(struct record *rec)
 {
-	struct perf_data *data = &rec->data;
 	struct perf_session *session = rec->session;

-	if (data->size == 0)
+	if (perf_data__size(&rec->data) == 0)
 		return 0;

 	/*
@ -851,7 +850,7 @@ record__finish_output(struct record *rec)
 		return;

 	rec->session->header.data_size += rec->bytes_written;
-	data->size = lseek(perf_data__fd(data), 0, SEEK_CUR);
+	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

 	if (!rec->no_buildid) {
 		process_buildids(rec);
@ -919,7 +918,7 @@ record__switch_output(struct record *rec, bool at_exit)

 	if (!quiet)
 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
-			data->file.path, timestamp);
+			data->path, timestamp);

 	/* Output tracking events */
 	if (!at_exit) {
@ -1462,7 +1461,7 @@ out_child:

 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
 			perf_data__size(data) / 1024.0 / 1024.0,
-			data->file.path, postfix, samples);
+			data->path, postfix, samples);
 	}

 out_delete_session:
@ -1863,7 +1862,7 @@ static struct option __record_options[] = {
 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
 		    "list of cpus to monitor"),
 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
-	OPT_STRING('o', "output", &record.data.file.path, "file",
+	OPT_STRING('o', "output", &record.data.path, "file",
 		    "output file name"),
 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
 			&record.opts.no_inherit_set,
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@ -899,7 +899,7 @@ static int __cmd_report(struct report *rep)
 		rep->nr_entries += evsel__hists(pos)->nr_entries;

 	if (rep->nr_entries == 0) {
-		ui__error("The %s file has no samples!\n", data->file.path);
+		ui__error("The %s data has no samples!\n", data->path);
 		return 0;
 	}

@ -1207,8 +1207,8 @@ int cmd_report(int argc, const char **argv)
 			input_name = "perf.data";
 	}

-	data.file.path = input_name;
-	data.force     = symbol_conf.force;
+	data.path  = input_name;
+	data.force = symbol_conf.force;

 repeat:
 	session = perf_session__new(&data, false, &report.tool);
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@ -1785,11 +1785,9 @@ static int perf_sched__read_events(struct perf_sched *sched)
 	};
 	struct perf_session *session;
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = sched->force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = sched->force,
 	};
 	int rc = -1;

@ -2958,11 +2956,9 @@ static int perf_sched__timehist(struct perf_sched *sched)
 		{ "sched:sched_migrate_task", timehist_migrate_task_event, },
 	};
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = sched->force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = sched->force,
 	};

 	struct perf_session *session;
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@ -149,6 +149,7 @@ static struct {
 	unsigned int print_ip_opts;
 	u64 fields;
 	u64 invalid_fields;
+	u64 user_set_fields;
 } output[OUTPUT_TYPE_MAX] = {

 	[PERF_TYPE_HARDWARE] = {
@ -345,7 +346,7 @@ static int perf_evsel__do_check_stype(struct perf_evsel *evsel,
 	if (attr->sample_type & sample_type)
 		return 0;

-	if (output[type].user_set) {
+	if (output[type].user_set_fields & field) {
 		if (allow_user_set)
 			return 0;
 		evname = perf_evsel__name(evsel);
@ -2632,10 +2633,13 @@ parse:
 					pr_warning("\'%s\' not valid for %s events. Ignoring.\n",
 						   all_output_options[i].str, event_type(j));
 				} else {
-					if (change == REMOVE)
+					if (change == REMOVE) {
 						output[j].fields &= ~all_output_options[i].field;
-					else
+						output[j].user_set_fields &= ~all_output_options[i].field;
+					} else {
 						output[j].fields |= all_output_options[i].field;
+						output[j].user_set_fields |= all_output_options[i].field;
+					}
 					output[j].user_set = true;
 					output[j].wildcard_set = true;
 				}
@ -2951,10 +2955,8 @@ int find_scripts(char **scripts_array, char **scripts_path_array)
 	DIR *scripts_dir, *lang_dir;
 	struct perf_session *session;
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
+		.path = input_name,
+		.mode = PERF_DATA_MODE_READ,
 	};
 	char *temp;
 	int i = 0;
@ -3427,8 +3429,8 @@ int cmd_script(int argc, const char **argv)
 	argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);

-	data.file.path = input_name;
-	data.force     = symbol_conf.force;
+	data.path  = input_name;
+	data.force = symbol_conf.force;

 	if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
 		rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
@ -3654,7 +3656,7 @@ int cmd_script(int argc, const char **argv)
 			goto out_delete;
 		}

-		input = open(data.file.path, O_RDONLY);	/* input_name */
+		input = open(data.path, O_RDONLY);	/* input_name */
 		if (input < 0) {
 			err = -errno;
 			perror("failed to open file");
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@ -1322,7 +1322,7 @@ static int __cmd_record(int argc, const char **argv)
 			     PARSE_OPT_STOP_AT_NON_OPTION);

 	if (output_name)
-		data->file.path = output_name;
+		data->path = output_name;

 	if (stat_config.run_count != 1 || forever) {
 		pr_err("Cannot use -r option with perf stat record.\n");
@ -1523,8 +1523,8 @@ static int __cmd_report(int argc, const char **argv)
 			input_name = "perf.data";
 	}

-	perf_stat.data.file.path = input_name;
-	perf_stat.data.mode      = PERF_DATA_MODE_READ;
+	perf_stat.data.path = input_name;
+	perf_stat.data.mode = PERF_DATA_MODE_READ;

 	session = perf_session__new(&perf_stat.data, false, &perf_stat.tool);
 	if (session == NULL)
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@ -1602,11 +1602,9 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name)
 		{ "syscalls:sys_exit_select",		process_exit_poll },
 	};
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = tchart->force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = tchart->force,
 	};

 	struct perf_session *session = perf_session__new(&data, false,
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@ -3154,11 +3154,9 @@ static int trace__replay(struct trace *trace)
 		{ "probe:vfs_getname",	     trace__vfs_getname, },
 	};
 	struct perf_data data = {
-		.file      = {
-			.path = input_name,
-		},
-		.mode      = PERF_DATA_MODE_READ,
-		.force     = trace->force,
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+		.force = trace->force,
 	};
 	struct perf_session *session;
 	struct perf_evsel *evsel;
--- a/tools/perf/scripts/python/exported-sql-viewer.py
+++ b/tools/perf/scripts/python/exported-sql-viewer.py
@ -1,3 +1,4 @@
+#!/usr/bin/env python2
 # SPDX-License-Identifier: GPL-2.0
 # exported-sql-viewer.py: view data from sql database
 # Copyright (c) 2014-2018, Intel Corporation.
@ -1397,18 +1398,28 @@ class BranchModel(TreeModel):
 	def HasMoreRecords(self):
 		return self.more

+# Report Variables
+
+class ReportVars():
+
+	def __init__(self, name = "", where_clause = "", limit = ""):
+		self.name = name
+		self.where_clause = where_clause
+		self.limit = limit
+
+	def UniqueId(self):
+		return str(self.where_clause + ";" + self.limit)
+
 # Branch window

 class BranchWindow(QMdiSubWindow):

-	def __init__(self, glb, event_id, name, where_clause, parent=None):
+	def __init__(self, glb, event_id, report_vars, parent=None):
 		super(BranchWindow, self).__init__(parent)

-		model_name = "Branch Events " + str(event_id)
-		if len(where_clause):
-			model_name = where_clause + " " + model_name
+		model_name = "Branch Events " + str(event_id) +  " " + report_vars.UniqueId()

-		self.model = LookupCreateModel(model_name, lambda: BranchModel(glb, event_id, where_clause))
+		self.model = LookupCreateModel(model_name, lambda: BranchModel(glb, event_id, report_vars.where_clause))

 		self.view = QTreeView()
 		self.view.setUniformRowHeights(True)
@ -1426,7 +1437,7 @@ class BranchWindow(QMdiSubWindow):

 		self.setWidget(self.vbox.Widget())

-		AddSubWindow(glb.mainwindow.mdi_area, self, name + " Branch Events")
+		AddSubWindow(glb.mainwindow.mdi_area, self, report_vars.name + " Branch Events")

 	def ResizeColumnToContents(self, column, n):
 		# Using the view's resizeColumnToContents() here is extrememly slow
@ -1471,47 +1482,134 @@ class BranchWindow(QMdiSubWindow):
 		else:
 			self.find_bar.NotFound()

-# Dialog data item converted and validated using a SQL table
+# Line edit data item

-class SQLTableDialogDataItem():
+class LineEditDataItem(object):

-	def __init__(self, glb, label, placeholder_text, table_name, match_column, column_name1, column_name2, parent):
+	def __init__(self, glb, label, placeholder_text, parent, id = "", default = ""):
 		self.glb = glb
 		self.label = label
 		self.placeholder_text = placeholder_text
-		self.table_name = table_name
-		self.match_column = match_column
-		self.column_name1 = column_name1
-		self.column_name2 = column_name2
 		self.parent = parent
+		self.id = id

-		self.value = ""
+		self.value = default

-		self.widget = QLineEdit()
+		self.widget = QLineEdit(default)
 		self.widget.editingFinished.connect(self.Validate)
 		self.widget.textChanged.connect(self.Invalidate)
 		self.red = False
 		self.error = ""
 		self.validated = True

-		self.last_id = 0
-		self.first_time = 0
-		self.last_time = 2 ** 64
-		if self.table_name == "<timeranges>":
-			query = QSqlQuery(self.glb.db)
-			QueryExec(query, "SELECT id, time FROM samples ORDER BY id DESC LIMIT 1")
-			if query.next():
-				self.last_id = int(query.value(0))
-				self.last_time = int(query.value(1))
-			QueryExec(query, "SELECT time FROM samples WHERE time != 0 ORDER BY id LIMIT 1")
-			if query.next():
-				self.first_time = int(query.value(0))
-			if placeholder_text:
-				placeholder_text += ", between " + str(self.first_time) + " and " + str(self.last_time)
-
 		if placeholder_text:
 			self.widget.setPlaceholderText(placeholder_text)

+	def TurnTextRed(self):
+		if not self.red:
+			palette = QPalette()
+			palette.setColor(QPalette.Text,Qt.red)
+			self.widget.setPalette(palette)
+			self.red = True
+
+	def TurnTextNormal(self):
+		if self.red:
+			palette = QPalette()
+			self.widget.setPalette(palette)
+			self.red = False
+
+	def InvalidValue(self, value):
+		self.value = ""
+		self.TurnTextRed()
+		self.error = self.label + " invalid value '" + value + "'"
+		self.parent.ShowMessage(self.error)
+
+	def Invalidate(self):
+		self.validated = False
+
+	def DoValidate(self, input_string):
+		self.value = input_string.strip()
+
+	def Validate(self):
+		self.validated = True
+		self.error = ""
+		self.TurnTextNormal()
+		self.parent.ClearMessage()
+		input_string = self.widget.text()
+		if not len(input_string.strip()):
+			self.value = ""
+			return
+		self.DoValidate(input_string)
+
+	def IsValid(self):
+		if not self.validated:
+			self.Validate()
+		if len(self.error):
+			self.parent.ShowMessage(self.error)
+			return False
+		return True
+
+	def IsNumber(self, value):
+		try:
+			x = int(value)
+		except:
+			x = 0
+		return str(x) == value
+
+# Non-negative integer ranges dialog data item
+
+class NonNegativeIntegerRangesDataItem(LineEditDataItem):
+
+	def __init__(self, glb, label, placeholder_text, column_name, parent):
+		super(NonNegativeIntegerRangesDataItem, self).__init__(glb, label, placeholder_text, parent)
+
+		self.column_name = column_name
+
+	def DoValidate(self, input_string):
+		singles = []
+		ranges = []
+		for value in [x.strip() for x in input_string.split(",")]:
+			if "-" in value:
+				vrange = value.split("-")
+				if len(vrange) != 2 or not self.IsNumber(vrange[0]) or not self.IsNumber(vrange[1]):
+					return self.InvalidValue(value)
+				ranges.append(vrange)
+			else:
+				if not self.IsNumber(value):
+					return self.InvalidValue(value)
+				singles.append(value)
+		ranges = [("(" + self.column_name + " >= " + r[0] + " AND " + self.column_name + " <= " + r[1] + ")") for r in ranges]
+		if len(singles):
+			ranges.append(self.column_name + " IN (" + ",".join(singles) + ")")
+		self.value = " OR ".join(ranges)
+
+# Positive integer dialog data item
+
+class PositiveIntegerDataItem(LineEditDataItem):
+
+	def __init__(self, glb, label, placeholder_text, parent, id = "", default = ""):
+		super(PositiveIntegerDataItem, self).__init__(glb, label, placeholder_text, parent, id, default)
+
+	def DoValidate(self, input_string):
+		if not self.IsNumber(input_string.strip()):
+			return self.InvalidValue(input_string)
+		value = int(input_string.strip())
+		if value <= 0:
+			return self.InvalidValue(input_string)
+		self.value = str(value)
+
+# Dialog data item converted and validated using a SQL table
+
+class SQLTableDataItem(LineEditDataItem):
+
+	def __init__(self, glb, label, placeholder_text, table_name, match_column, column_name1, column_name2, parent):
+		super(SQLTableDataItem, self).__init__(glb, label, placeholder_text, parent)
+
+		self.table_name = table_name
+		self.match_column = match_column
+		self.column_name1 = column_name1
+		self.column_name2 = column_name2
+
 	def ValueToIds(self, value):
 		ids = []
 		query = QSqlQuery(self.glb.db)
@ -1522,6 +1620,42 @@ class SQLTableDialogDataItem():
 				ids.append(str(query.value(0)))
 		return ids

+	def DoValidate(self, input_string):
+		all_ids = []
+		for value in [x.strip() for x in input_string.split(",")]:
+			ids = self.ValueToIds(value)
+			if len(ids):
+				all_ids.extend(ids)
+			else:
+				return self.InvalidValue(value)
+		self.value = self.column_name1 + " IN (" + ",".join(all_ids) + ")"
+		if self.column_name2:
+			self.value = "( " + self.value + " OR " + self.column_name2 + " IN (" + ",".join(all_ids) + ") )"
+
+# Sample time ranges dialog data item converted and validated using 'samples' SQL table
+
+class SampleTimeRangesDataItem(LineEditDataItem):
+
+	def __init__(self, glb, label, placeholder_text, column_name, parent):
+		self.column_name = column_name
+
+		self.last_id = 0
+		self.first_time = 0
+		self.last_time = 2 ** 64
+
+		query = QSqlQuery(glb.db)
+		QueryExec(query, "SELECT id, time FROM samples ORDER BY id DESC LIMIT 1")
+		if query.next():
+			self.last_id = int(query.value(0))
+			self.last_time = int(query.value(1))
+		QueryExec(query, "SELECT time FROM samples WHERE time != 0 ORDER BY id LIMIT 1")
+		if query.next():
+			self.first_time = int(query.value(0))
+		if placeholder_text:
+			placeholder_text += ", between " + str(self.first_time) + " and " + str(self.last_time)
+
+		super(SampleTimeRangesDataItem, self).__init__(glb, label, placeholder_text, parent)
+
 	def IdBetween(self, query, lower_id, higher_id, order):
 		QueryExec(query, "SELECT id FROM samples WHERE id > " + str(lower_id) + " AND id < " + str(higher_id) + " ORDER BY id " + order + " LIMIT 1")
 		if query.next():
@ -1559,7 +1693,6 @@ class SQLTableDialogDataItem():
 					return str(lower_id)

 	def ConvertRelativeTime(self, val):
-		print "val ", val
 		mult = 1
 		suffix = val[-2:]
 		if suffix == "ms":
@ -1581,29 +1714,23 @@ class SQLTableDialogDataItem():
 		return str(val)

 	def ConvertTimeRange(self, vrange):
-		print "vrange ", vrange
 		if vrange[0] == "":
 			vrange[0] = str(self.first_time)
 		if vrange[1] == "":
 			vrange[1] = str(self.last_time)
 		vrange[0] = self.ConvertRelativeTime(vrange[0])
 		vrange[1] = self.ConvertRelativeTime(vrange[1])
-		print "vrange2 ", vrange
 		if not self.IsNumber(vrange[0]) or not self.IsNumber(vrange[1]):
 			return False
-		print "ok1"
 		beg_range = max(int(vrange[0]), self.first_time)
 		end_range = min(int(vrange[1]), self.last_time)
 		if beg_range > self.last_time or end_range < self.first_time:
 			return False
-		print "ok2"
 		vrange[0] = self.BinarySearchTime(0, self.last_id, beg_range, True)
 		vrange[1] = self.BinarySearchTime(1, self.last_id + 1, end_range, False)
-		print "vrange3 ", vrange
 		return True

 	def AddTimeRange(self, value, ranges):
-		print "value ", value
 		n = value.count("-")
 		if n == 1:
 			pass
@ -1621,111 +1748,31 @@ class SQLTableDialogDataItem():
 			return True
 		return False

-	def InvalidValue(self, value):
-		self.value = ""
-		palette = QPalette()
-		palette.setColor(QPalette.Text,Qt.red)
-		self.widget.setPalette(palette)
-		self.red = True
-		self.error = self.label + " invalid value '" + value + "'"
-		self.parent.ShowMessage(self.error)
+	def DoValidate(self, input_string):
+		ranges = []
+		for value in [x.strip() for x in input_string.split(",")]:
+			if not self.AddTimeRange(value, ranges):
+				return self.InvalidValue(value)
+		ranges = [("(" + self.column_name + " >= " + r[0] + " AND " + self.column_name + " <= " + r[1] + ")") for r in ranges]
+		self.value = " OR ".join(ranges)

-	def IsNumber(self, value):
-		try:
-			x = int(value)
-		except:
-			x = 0
-		return str(x) == value
+# Report Dialog Base

-	def Invalidate(self):
-		self.validated = False
+class ReportDialogBase(QDialog):

-	def Validate(self):
-		input_string = self.widget.text()
-		self.validated = True
-		if self.red:
-			palette = QPalette()
-			self.widget.setPalette(palette)
-			self.red = False
-		if not len(input_string.strip()):
-			self.error = ""
-			self.value = ""
-			return
-		if self.table_name == "<timeranges>":
-			ranges = []
-			for value in [x.strip() for x in input_string.split(",")]:
-				if not self.AddTimeRange(value, ranges):
-					return self.InvalidValue(value)
-			ranges = [("(" + self.column_name1 + " >= " + r[0] + " AND " + self.column_name1 + " <= " + r[1] + ")") for r in ranges]
-			self.value = " OR ".join(ranges)
-		elif self.table_name == "<ranges>":
-			singles = []
-			ranges = []
-			for value in [x.strip() for x in input_string.split(",")]:
-				if "-" in value:
-					vrange = value.split("-")
-					if len(vrange) != 2 or not self.IsNumber(vrange[0]) or not self.IsNumber(vrange[1]):
-						return self.InvalidValue(value)
-					ranges.append(vrange)
-				else:
-					if not self.IsNumber(value):
-						return self.InvalidValue(value)
-					singles.append(value)
-			ranges = [("(" + self.column_name1 + " >= " + r[0] + " AND " + self.column_name1 + " <= " + r[1] + ")") for r in ranges]
-			if len(singles):
-				ranges.append(self.column_name1 + " IN (" + ",".join(singles) + ")")
-			self.value = " OR ".join(ranges)
-		elif self.table_name:
-			all_ids = []
-			for value in [x.strip() for x in input_string.split(",")]:
-				ids = self.ValueToIds(value)
-				if len(ids):
-					all_ids.extend(ids)
-				else:
-					return self.InvalidValue(value)
-			self.value = self.column_name1 + " IN (" + ",".join(all_ids) + ")"
-			if self.column_name2:
-				self.value = "( " + self.value + " OR " + self.column_name2 + " IN (" + ",".join(all_ids) + ") )"
-		else:
-			self.value = input_string.strip()
-		self.error = ""
-		self.parent.ClearMessage()
-
-	def IsValid(self):
-		if not self.validated:
-			self.Validate()
-		if len(self.error):
-			self.parent.ShowMessage(self.error)
-			return False
-		return True
-
-# Selected branch report creation dialog
-
-class SelectedBranchDialog(QDialog):
-
-	def __init__(self, glb, parent=None):
-		super(SelectedBranchDialog, self).__init__(parent)
+	def __init__(self, glb, title, items, partial, parent=None):
+		super(ReportDialogBase, self).__init__(parent)

 		self.glb = glb

-		self.name = ""
-		self.where_clause = ""
+		self.report_vars = ReportVars()

-		self.setWindowTitle("Selected Branches")
+		self.setWindowTitle(title)
 		self.setMinimumWidth(600)

-		items = (
-			("Report name:", "Enter a name to appear in the window title bar", "", "", "", ""),
-			("Time ranges:", "Enter time ranges", "<timeranges>", "", "samples.id", ""),
-			("CPUs:", "Enter CPUs or ranges e.g. 0,5-6", "<ranges>", "", "cpu", ""),
-			("Commands:", "Only branches with these commands will be included", "comms", "comm", "comm_id", ""),
-			("PIDs:", "Only branches with these process IDs will be included", "threads", "pid", "thread_id", ""),
-			("TIDs:", "Only branches with these thread IDs will be included", "threads", "tid", "thread_id", ""),
-			("DSOs:", "Only branches with these DSOs will be included", "dsos", "short_name", "samples.dso_id", "to_dso_id"),
-			("Symbols:", "Only branches with these symbols will be included", "symbols", "name", "symbol_id", "to_symbol_id"),
-			("Raw SQL clause: ", "Enter a raw SQL WHERE clause", "", "", "", ""),
-			)
-		self.data_items = [SQLTableDialogDataItem(glb, *x, parent=self) for x in items]
+		self.data_items = [x(glb, self) for x in items]
+
+		self.partial = partial

 		self.grid = QGridLayout()

@ -1757,23 +1804,28 @@ class SelectedBranchDialog(QDialog):
 		self.setLayout(self.vbox);

 	def Ok(self):
-		self.name = self.data_items[0].value
-		if not self.name:
+		vars = self.report_vars
+		for d in self.data_items:
+			if d.id == "REPORTNAME":
+				vars.name = d.value
+		if not vars.name:
 			self.ShowMessage("Report name is required")
 			return
 		for d in self.data_items:
 			if not d.IsValid():
 				return
 		for d in self.data_items[1:]:
-			if len(d.value):
-				if len(self.where_clause):
-					self.where_clause += " AND "
-				self.where_clause += d.value
-		if len(self.where_clause):
-			self.where_clause = " AND ( " + self.where_clause + " ) "
-		else:
-			self.ShowMessage("No selection")
-			return
+			if d.id == "LIMIT":
+				vars.limit = d.value
+			elif len(d.value):
+				if len(vars.where_clause):
+					vars.where_clause += " AND "
+				vars.where_clause += d.value
+		if len(vars.where_clause):
+			if self.partial:
+				vars.where_clause = " AND ( " + vars.where_clause + " ) "
+			else:
+				vars.where_clause = " WHERE " + vars.where_clause + " "
 		self.accept()

 	def ShowMessage(self, msg):
@ -1782,6 +1834,23 @@ class SelectedBranchDialog(QDialog):
 	def ClearMessage(self):
 		self.status.setText("")

+# Selected branch report creation dialog
+
+class SelectedBranchDialog(ReportDialogBase):
+
+	def __init__(self, glb, parent=None):
+		title = "Selected Branches"
+		items = (lambda g, p: LineEditDataItem(g, "Report name:", "Enter a name to appear in the window title bar", p, "REPORTNAME"),
+			 lambda g, p: SampleTimeRangesDataItem(g, "Time ranges:", "Enter time ranges", "samples.id", p),
+			 lambda g, p: NonNegativeIntegerRangesDataItem(g, "CPUs:", "Enter CPUs or ranges e.g. 0,5-6", "cpu", p),
+			 lambda g, p: SQLTableDataItem(g, "Commands:", "Only branches with these commands will be included", "comms", "comm", "comm_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "PIDs:", "Only branches with these process IDs will be included", "threads", "pid", "thread_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "TIDs:", "Only branches with these thread IDs will be included", "threads", "tid", "thread_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "DSOs:", "Only branches with these DSOs will be included", "dsos", "short_name", "samples.dso_id", "to_dso_id", p),
+			 lambda g, p: SQLTableDataItem(g, "Symbols:", "Only branches with these symbols will be included", "symbols", "name", "symbol_id", "to_symbol_id", p),
+			 lambda g, p: LineEditDataItem(g, "Raw SQL clause: ", "Enter a raw SQL WHERE clause", p))
+		super(SelectedBranchDialog, self).__init__(glb, title, items, True, parent)
+
 # Event list

 def GetEventList(db):
@ -1792,6 +1861,16 @@ def GetEventList(db):
 		events.append(query.value(0))
 	return events

+# Is a table selectable
+
+def IsSelectable(db, table):
+	query = QSqlQuery(db)
+	try:
+		QueryExec(query, "SELECT * FROM " + table + " LIMIT 1")
+	except:
+		return False
+	return True
+
 # SQL data preparation

 def SQLTableDataPrep(query, count):
@ -1817,12 +1896,13 @@ class SQLTableModel(TableModel):

 	progress = Signal(object)

-	def __init__(self, glb, sql, column_count, parent=None):
+	def __init__(self, glb, sql, column_headers, parent=None):
 		super(SQLTableModel, self).__init__(parent)
 		self.glb = glb
 		self.more = True
 		self.populated = 0
-		self.fetcher = SQLFetcher(glb, sql, lambda x, y=column_count: SQLTableDataPrep(x, y), self.AddSample)
+		self.column_headers = column_headers
+		self.fetcher = SQLFetcher(glb, sql, lambda x, y=len(column_headers): SQLTableDataPrep(x, y), self.AddSample)
 		self.fetcher.done.connect(self.Update)
 		self.fetcher.Fetch(glb_chunk_sz)

@ -1860,6 +1940,12 @@ class SQLTableModel(TableModel):
 	def HasMoreRecords(self):
 		return self.more

+	def columnCount(self, parent=None):
+		return len(self.column_headers)
+
+	def columnHeader(self, column):
+		return self.column_headers[column]
+
 # SQL automatic table data model

 class SQLAutoTableModel(SQLTableModel):
@ -1869,12 +1955,12 @@ class SQLAutoTableModel(SQLTableModel):
 		if table_name == "comm_threads_view":
 			# For now, comm_threads_view has no id column
 			sql = "SELECT * FROM " + table_name + " WHERE comm_id > $$last_id$$ ORDER BY comm_id LIMIT " + str(glb_chunk_sz)
-		self.column_headers = []
+		column_headers = []
 		query = QSqlQuery(glb.db)
 		if glb.dbref.is_sqlite3:
 			QueryExec(query, "PRAGMA table_info(" + table_name + ")")
 			while query.next():
-				self.column_headers.append(query.value(1))
+				column_headers.append(query.value(1))
 			if table_name == "sqlite_master":
 				sql = "SELECT * FROM " + table_name
 		else:
@ -1887,14 +1973,8 @@ class SQLAutoTableModel(SQLTableModel):
 				schema = "public"
 			QueryExec(query, "SELECT column_name FROM information_schema.columns WHERE table_schema = '" + schema + "' and table_name = '" + select_table_name + "'")
 			while query.next():
-				self.column_headers.append(query.value(0))
-		super(SQLAutoTableModel, self).__init__(glb, sql, len(self.column_headers), parent)
-
-	def columnCount(self, parent=None):
-		return len(self.column_headers)
-
-	def columnHeader(self, column):
-		return self.column_headers[column]
+				column_headers.append(query.value(0))
+		super(SQLAutoTableModel, self).__init__(glb, sql, column_headers, parent)

 # Base class for custom ResizeColumnsToContents

@ -1997,6 +2077,103 @@ def GetTableList(glb):
 		tables.append("information_schema.columns")
 	return tables

+# Top Calls data model
+
+class TopCallsModel(SQLTableModel):
+
+	def __init__(self, glb, report_vars, parent=None):
+		text = ""
+		if not glb.dbref.is_sqlite3:
+			text = "::text"
+		limit = ""
+		if len(report_vars.limit):
+			limit = " LIMIT " + report_vars.limit
+		sql = ("SELECT comm, pid, tid, name,"
+			" CASE"
+			" WHEN (short_name = '[kernel.kallsyms]') THEN '[kernel]'" + text +
+			" ELSE short_name"
+			" END AS dso,"
+			" call_time, return_time, (return_time - call_time) AS elapsed_time, branch_count, "
+			" CASE"
+			" WHEN (calls.flags = 1) THEN 'no call'" + text +
+			" WHEN (calls.flags = 2) THEN 'no return'" + text +
+			" WHEN (calls.flags = 3) THEN 'no call/return'" + text +
+			" ELSE ''" + text +
+			" END AS flags"
+			" FROM calls"
+			" INNER JOIN call_paths ON calls.call_path_id = call_paths.id"
+			" INNER JOIN symbols ON call_paths.symbol_id = symbols.id"
+			" INNER JOIN dsos ON symbols.dso_id = dsos.id"
+			" INNER JOIN comms ON calls.comm_id = comms.id"
+			" INNER JOIN threads ON calls.thread_id = threads.id" +
+			report_vars.where_clause +
+			" ORDER BY elapsed_time DESC" +
+			limit
+			)
+		column_headers = ("Command", "PID", "TID", "Symbol", "Object", "Call Time", "Return Time", "Elapsed Time (ns)", "Branch Count", "Flags")
+		self.alignment = (Qt.AlignLeft, Qt.AlignLeft, Qt.AlignLeft, Qt.AlignLeft, Qt.AlignLeft, Qt.AlignLeft, Qt.AlignLeft, Qt.AlignRight, Qt.AlignRight, Qt.AlignLeft)
+		super(TopCallsModel, self).__init__(glb, sql, column_headers, parent)
+
+	def columnAlignment(self, column):
+		return self.alignment[column]
+
+# Top Calls report creation dialog
+
+class TopCallsDialog(ReportDialogBase):
+
+	def __init__(self, glb, parent=None):
+		title = "Top Calls by Elapsed Time"
+		items = (lambda g, p: LineEditDataItem(g, "Report name:", "Enter a name to appear in the window title bar", p, "REPORTNAME"),
+			 lambda g, p: SQLTableDataItem(g, "Commands:", "Only calls with these commands will be included", "comms", "comm", "comm_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "PIDs:", "Only calls with these process IDs will be included", "threads", "pid", "thread_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "TIDs:", "Only calls with these thread IDs will be included", "threads", "tid", "thread_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "DSOs:", "Only calls with these DSOs will be included", "dsos", "short_name", "dso_id", "", p),
+			 lambda g, p: SQLTableDataItem(g, "Symbols:", "Only calls with these symbols will be included", "symbols", "name", "symbol_id", "", p),
+			 lambda g, p: LineEditDataItem(g, "Raw SQL clause: ", "Enter a raw SQL WHERE clause", p),
+			 lambda g, p: PositiveIntegerDataItem(g, "Record limit:", "Limit selection to this number of records", p, "LIMIT", "100"))
+		super(TopCallsDialog, self).__init__(glb, title, items, False, parent)
+
+# Top Calls window
+
+class TopCallsWindow(QMdiSubWindow, ResizeColumnsToContentsBase):
+
+	def __init__(self, glb, report_vars, parent=None):
+		super(TopCallsWindow, self).__init__(parent)
+
+		self.data_model = LookupCreateModel("Top Calls " + report_vars.UniqueId(), lambda: TopCallsModel(glb, report_vars))
+		self.model = self.data_model
+
+		self.view = QTableView()
+		self.view.setModel(self.model)
+		self.view.setEditTriggers(QAbstractItemView.NoEditTriggers)
+		self.view.verticalHeader().setVisible(False)
+
+		self.ResizeColumnsToContents()
+
+		self.find_bar = FindBar(self, self, True)
+
+		self.finder = ChildDataItemFinder(self.model)
+
+		self.fetch_bar = FetchMoreRecordsBar(self.data_model, self)
+
+		self.vbox = VBox(self.view, self.find_bar.Widget(), self.fetch_bar.Widget())
+
+		self.setWidget(self.vbox.Widget())
+
+		AddSubWindow(glb.mainwindow.mdi_area, self, report_vars.name)
+
+	def Find(self, value, direction, pattern, context):
+		self.view.setFocus()
+		self.find_bar.Busy()
+		self.finder.Find(value, direction, pattern, context, self.FindDone)
+
+	def FindDone(self, row):
+		self.find_bar.Idle()
+		if row >= 0:
+			self.view.setCurrentIndex(self.model.index(row, 0, QModelIndex()))
+		else:
+			self.find_bar.NotFound()
+
 # Action Definition

 def CreateAction(label, tip, callback, parent=None, shortcut=None):
@ -2100,6 +2277,7 @@ p.c2 {
 <p class=c2><a href=#callgraph>1.1 Context-Sensitive Call Graph</a></p>
 <p class=c2><a href=#allbranches>1.2 All branches</a></p>
 <p class=c2><a href=#selectedbranches>1.3 Selected branches</a></p>
+<p class=c2><a href=#topcallsbyelapsedtime>1.4 Top calls by elapsed time</a></p>
 <p class=c1><a href=#tables>2. Tables</a></p>
 <h1 id=reports>1. Reports</h1>
 <h2 id=callgraph>1.1 Context-Sensitive Call Graph</h2>
@ -2175,6 +2353,10 @@ ms, us or ns. Also, negative values are relative to the end of trace.  Examples:
 	-10ms-			The last 10ms
 </pre>
 N.B. Due to the granularity of timestamps, there could be no branches in any given time range.
+<h2 id=topcallsbyelapsedtime>1.4 Top calls by elapsed time</h2>
+The Top calls by elapsed time report displays calls in descending order of time elapsed between when the function was called and when it returned.
+The data is reduced by various selection criteria. A dialog box displays available criteria which are AND'ed together.
+If not all data is fetched, a Fetch bar is provided. Ctrl-F displays a Find bar.
 <h1 id=tables>2. Tables</h1>
 The Tables menu shows all tables and views in the database. Most tables have an associated view
 which displays the information in a more friendly way. Not all data for large tables is fetched
@ -2304,10 +2486,14 @@ class MainWindow(QMainWindow):
 		edit_menu.addAction(CreateAction("&Enlarge Font", "Make text bigger", self.EnlargeFont, self, [QKeySequence("Ctrl++")]))

 		reports_menu = menu.addMenu("&Reports")
-		reports_menu.addAction(CreateAction("Context-Sensitive Call &Graph", "Create a new window containing a context-sensitive call graph", self.NewCallGraph, self))
+		if IsSelectable(glb.db, "calls"):
+			reports_menu.addAction(CreateAction("Context-Sensitive Call &Graph", "Create a new window containing a context-sensitive call graph", self.NewCallGraph, self))

 		self.EventMenu(GetEventList(glb.db), reports_menu)

+		if IsSelectable(glb.db, "calls"):
+			reports_menu.addAction(CreateAction("&Top calls by elapsed time", "Create a new window displaying top calls by elapsed time", self.NewTopCalls, self))
+
 		self.TableMenu(GetTableList(glb), menu)

 		self.window_menu = WindowMenu(self.mdi_area, menu)
@ -2363,14 +2549,20 @@ class MainWindow(QMainWindow):
 	def NewCallGraph(self):
 		CallGraphWindow(self.glb, self)

+	def NewTopCalls(self):
+		dialog = TopCallsDialog(self.glb, self)
+		ret = dialog.exec_()
+		if ret:
+			TopCallsWindow(self.glb, dialog.report_vars, self)
+
 	def NewBranchView(self, event_id):
-		BranchWindow(self.glb, event_id, "", "", self)
+		BranchWindow(self.glb, event_id, ReportVars(), self)

 	def NewSelectedBranchView(self, event_id):
 		dialog = SelectedBranchDialog(self.glb, self)
 		ret = dialog.exec_()
 		if ret:
-			BranchWindow(self.glb, event_id, dialog.name, dialog.where_clause, self)
+			BranchWindow(self.glb, event_id, dialog.report_vars, self)

 	def NewTableView(self, table_name):
 		TableWindow(self.glb, table_name, self)
--- a/tools/perf/scripts/python/failed-syscalls-by-pid.py
+++ b/tools/perf/scripts/python/failed-syscalls-by-pid.py
@ -5,6 +5,8 @@
 # Displays system-wide failed system call totals, broken down by pid.
 # If a [comm] arg is specified, only syscalls called by [comm] are displayed.

+from __future__ import print_function
+
 import os
 import sys

@ -32,7 +34,7 @@ if len(sys.argv) > 1:
 syscalls = autodict()

 def trace_begin():
-	print "Press control+C to stop and show the summary"
+	print("Press control+C to stop and show the summary")

 def trace_end():
 	print_error_totals()
@ -57,22 +59,21 @@ def syscalls__sys_exit(event_name, context, common_cpu,

 def print_error_totals():
    if for_comm is not None:
-	    print "\nsyscall errors for %s:\n\n" % (for_comm),
+	    print("\nsyscall errors for %s:\n" % (for_comm))
    else:
-	    print "\nsyscall errors:\n\n",
+	    print("\nsyscall errors:\n")

-    print "%-30s  %10s\n" % ("comm [pid]", "count"),
-    print "%-30s  %10s\n" % ("------------------------------", \
-                                 "----------"),
+    print("%-30s  %10s" % ("comm [pid]", "count"))
+    print("%-30s  %10s" % ("------------------------------", "----------"))

    comm_keys = syscalls.keys()
    for comm in comm_keys:
 	    pid_keys = syscalls[comm].keys()
 	    for pid in pid_keys:
-		    print "\n%s [%d]\n" % (comm, pid),
+		    print("\n%s [%d]" % (comm, pid))
 		    id_keys = syscalls[comm][pid].keys()
 		    for id in id_keys:
-			    print "  syscall: %-16s\n" % syscall_name(id),
+			    print("  syscall: %-16s" % syscall_name(id))
 			    ret_keys = syscalls[comm][pid][id].keys()
-			    for ret, val in sorted(syscalls[comm][pid][id].iteritems(), key = lambda(k, v): (v, k),  reverse = True):
-				    print "    err = %-20s  %10d\n" % (strerror(ret), val),
+			    for ret, val in sorted(syscalls[comm][pid][id].items(), key = lambda kv: (kv[1], kv[0]),  reverse = True):
+				    print("    err = %-20s  %10d" % (strerror(ret), val))
--- a/tools/perf/scripts/python/mem-phys-addr.py
+++ b/tools/perf/scripts/python/mem-phys-addr.py
@ -4,6 +4,8 @@
 # Copyright (c) 2018, Intel Corporation.

 from __future__ import division
+from __future__ import print_function
+
 import os
 import sys
 import struct
@ -31,21 +33,23 @@ def parse_iomem():
 	for i, j in enumerate(f):
 		m = re.split('-|:',j,2)
 		if m[2].strip() == 'System RAM':
-			system_ram.append(long(m[0], 16))
-			system_ram.append(long(m[1], 16))
+			system_ram.append(int(m[0], 16))
+			system_ram.append(int(m[1], 16))
 		if m[2].strip() == 'Persistent Memory':
-			pmem.append(long(m[0], 16))
-			pmem.append(long(m[1], 16))
+			pmem.append(int(m[0], 16))
+			pmem.append(int(m[1], 16))

 def print_memory_type():
-	print "Event: %s" % (event_name)
-	print "%-40s  %10s  %10s\n" % ("Memory type", "count", "percentage"),
-	print "%-40s  %10s  %10s\n" % ("----------------------------------------", \
+	print("Event: %s" % (event_name))
+	print("%-40s  %10s  %10s\n" % ("Memory type", "count", "percentage"), end='')
+	print("%-40s  %10s  %10s\n" % ("----------------------------------------",
 					"-----------", "-----------"),
+                                        end='');
 	total = sum(load_mem_type_cnt.values())
 	for mem_type, count in sorted(load_mem_type_cnt.most_common(), \
-					key = lambda(k, v): (v, k), reverse = True):
-		print "%-40s  %10d  %10.1f%%\n" % (mem_type, count, 100 * count / total),
+					key = lambda kv: (kv[1], kv[0]), reverse = True):
+		print("%-40s  %10d  %10.1f%%\n" % (mem_type, count, 100 * count / total),
+                        end='')

 def trace_begin():
 	parse_iomem()
@ -80,7 +84,7 @@ def find_memory_type(phys_addr):
 	f.seek(0, 0)
 	for j in f:
 		m = re.split('-|:',j,2)
-		if long(m[0], 16) <= phys_addr <= long(m[1], 16):
+		if int(m[0], 16) <= phys_addr <= int(m[1], 16):
 			return m[2]
 	return "N/A"

--- a/tools/perf/scripts/python/net_dropmonitor.py
+++ b/tools/perf/scripts/python/net_dropmonitor.py
@ -1,6 +1,8 @@
 # Monitor the system for dropped packets and proudce a report of drop locations and counts
 # SPDX-License-Identifier: GPL-2.0

+from __future__ import print_function
+
 import os
 import sys

@ -50,19 +52,19 @@ def get_sym(sloc):
 		return (None, 0)

 def print_drop_table():
-	print "%25s %25s %25s" % ("LOCATION", "OFFSET", "COUNT")
+	print("%25s %25s %25s" % ("LOCATION", "OFFSET", "COUNT"))
 	for i in drop_log.keys():
 		(sym, off) = get_sym(i)
 		if sym == None:
 			sym = i
-		print "%25s %25s %25s" % (sym, off, drop_log[i])
+		print("%25s %25s %25s" % (sym, off, drop_log[i]))


 def trace_begin():
-	print "Starting trace (Ctrl-C to dump results)"
+	print("Starting trace (Ctrl-C to dump results)")

 def trace_end():
-	print "Gathering kallsyms data"
+	print("Gathering kallsyms data")
 	get_kallsyms_table()
 	print_drop_table()

--- a/tools/perf/scripts/python/netdev-times.py
+++ b/tools/perf/scripts/python/netdev-times.py
@ -8,6 +8,8 @@
 # dev=: show only thing related to specified device
 # debug: work with debug mode. It shows buffer status.

+from __future__ import print_function
+
 import os
 import sys

@ -17,6 +19,7 @@ sys.path.append(os.environ['PERF_EXEC_PATH'] + \
 from perf_trace_context import *
 from Core import *
 from Util import *
+from functools import cmp_to_key

 all_event_list = []; # insert all tracepoint event related with this script
 irq_dic = {}; # key is cpu and value is a list which stacks irqs
@ -61,12 +64,12 @@ def diff_msec(src, dst):
 def print_transmit(hunk):
 	if dev != 0 and hunk['dev'].find(dev) < 0:
 		return
-	print "%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" % \
+	print("%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" %
 		(hunk['dev'], hunk['len'],
 		nsecs_secs(hunk['queue_t']),
 		nsecs_nsecs(hunk['queue_t'])/1000,
 		diff_msec(hunk['queue_t'], hunk['xmit_t']),
-		diff_msec(hunk['xmit_t'], hunk['free_t']))
+		diff_msec(hunk['xmit_t'], hunk['free_t'])))

 # Format for displaying rx packet processing
 PF_IRQ_ENTRY= "  irq_entry(+%.3fmsec irq=%d:%s)"
@ -98,55 +101,55 @@ def print_receive(hunk):
 	if show_hunk == 0:
 		return

-	print "%d.%06dsec cpu=%d" % \
-		(nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu)
+	print("%d.%06dsec cpu=%d" %
+		(nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu))
 	for i in range(len(irq_list)):
-		print PF_IRQ_ENTRY % \
+		print(PF_IRQ_ENTRY %
 			(diff_msec(base_t, irq_list[i]['irq_ent_t']),
-			irq_list[i]['irq'], irq_list[i]['name'])
-		print PF_JOINT
+			irq_list[i]['irq'], irq_list[i]['name']))
+		print(PF_JOINT)
 		irq_event_list = irq_list[i]['event_list']
 		for j in range(len(irq_event_list)):
 			irq_event = irq_event_list[j]
 			if irq_event['event'] == 'netif_rx':
-				print PF_NET_RX % \
+				print(PF_NET_RX %
 					(diff_msec(base_t, irq_event['time']),
-					irq_event['skbaddr'])
-				print PF_JOINT
-	print PF_SOFT_ENTRY % \
-		diff_msec(base_t, hunk['sirq_ent_t'])
-	print PF_JOINT
+					irq_event['skbaddr']))
+				print(PF_JOINT)
+	print(PF_SOFT_ENTRY %
+		diff_msec(base_t, hunk['sirq_ent_t']))
+	print(PF_JOINT)
 	event_list = hunk['event_list']
 	for i in range(len(event_list)):
 		event = event_list[i]
 		if event['event_name'] == 'napi_poll':
-			print PF_NAPI_POLL % \
-			    (diff_msec(base_t, event['event_t']), event['dev'])
+			print(PF_NAPI_POLL %
+			    (diff_msec(base_t, event['event_t']), event['dev']))
 			if i == len(event_list) - 1:
-				print ""
+				print("")
 			else:
-				print PF_JOINT
+				print(PF_JOINT)
 		else:
-			print PF_NET_RECV % \
+			print(PF_NET_RECV %
 			    (diff_msec(base_t, event['event_t']), event['skbaddr'],
-				event['len'])
+				event['len']))
 			if 'comm' in event.keys():
-				print PF_WJOINT
-				print PF_CPY_DGRAM % \
+				print(PF_WJOINT)
+				print(PF_CPY_DGRAM %
 					(diff_msec(base_t, event['comm_t']),
-					event['pid'], event['comm'])
+					event['pid'], event['comm']))
 			elif 'handle' in event.keys():
-				print PF_WJOINT
+				print(PF_WJOINT)
 				if event['handle'] == "kfree_skb":
-					print PF_KFREE_SKB % \
+					print(PF_KFREE_SKB %
 						(diff_msec(base_t,
 						event['comm_t']),
-						event['location'])
+						event['location']))
 				elif event['handle'] == "consume_skb":
-					print PF_CONS_SKB % \
+					print(PF_CONS_SKB %
 						diff_msec(base_t,
-							event['comm_t'])
-			print PF_JOINT
+							event['comm_t']))
+			print(PF_JOINT)

 def trace_begin():
 	global show_tx
@ -172,8 +175,7 @@ def trace_begin():

 def trace_end():
 	# order all events in time
-	all_event_list.sort(lambda a,b :cmp(a[EINFO_IDX_TIME],
-					    b[EINFO_IDX_TIME]))
+	all_event_list.sort(key=cmp_to_key(lambda a,b :a[EINFO_IDX_TIME] < b[EINFO_IDX_TIME]))
 	# process all events
 	for i in range(len(all_event_list)):
 		event_info = all_event_list[i]
@ -210,19 +212,19 @@ def trace_end():
 			print_receive(receive_hunk_list[i])
 	# display transmit hunks
 	if show_tx:
-		print "   dev    len      Qdisc        " \
-			"       netdevice             free"
+		print("   dev    len      Qdisc        "
+			"       netdevice             free")
 		for i in range(len(tx_free_list)):
 			print_transmit(tx_free_list[i])
 	if debug:
-		print "debug buffer status"
-		print "----------------------------"
-		print "xmit Qdisc:remain:%d overflow:%d" % \
-			(len(tx_queue_list), of_count_tx_queue_list)
-		print "xmit netdevice:remain:%d overflow:%d" % \
-			(len(tx_xmit_list), of_count_tx_xmit_list)
-		print "receive:remain:%d overflow:%d" % \
-			(len(rx_skb_list), of_count_rx_skb_list)
+		print("debug buffer status")
+		print("----------------------------")
+		print("xmit Qdisc:remain:%d overflow:%d" %
+			(len(tx_queue_list), of_count_tx_queue_list))
+		print("xmit netdevice:remain:%d overflow:%d" %
+			(len(tx_xmit_list), of_count_tx_xmit_list))
+		print("receive:remain:%d overflow:%d" %
+			(len(rx_skb_list), of_count_rx_skb_list))

 # called from perf, when it finds a correspoinding event
 def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, callchain, vec):
--- a/tools/perf/scripts/python/powerpc-hcalls.py
+++ b/tools/perf/scripts/python/powerpc-hcalls.py
@ -4,6 +4,8 @@
 #
 # Hypervisor call statisics

+from __future__ import print_function
+
 import os
 import sys

@ -149,7 +151,7 @@ hcall_table = {
 }

 def hcall_table_lookup(opcode):
-	if (hcall_table.has_key(opcode)):
+	if (opcode in hcall_table):
 		return hcall_table[opcode]
 	else:
 		return opcode
@ -157,8 +159,8 @@ def hcall_table_lookup(opcode):
 print_ptrn = '%-28s%10s%10s%10s%10s'

 def trace_end():
-	print print_ptrn % ('hcall', 'count', 'min(ns)', 'max(ns)', 'avg(ns)')
-	print '-' * 68
+	print(print_ptrn % ('hcall', 'count', 'min(ns)', 'max(ns)', 'avg(ns)'))
+	print('-' * 68)
 	for opcode in output:
 		h_name = hcall_table_lookup(opcode)
 		time = output[opcode]['time']
@ -166,14 +168,14 @@ def trace_end():
 		min_t = output[opcode]['min']
 		max_t = output[opcode]['max']

-		print print_ptrn % (h_name, cnt, min_t, max_t, time/cnt)
+		print(print_ptrn % (h_name, cnt, min_t, max_t, time//cnt))

 def powerpc__hcall_exit(name, context, cpu, sec, nsec, pid, comm, callchain,
 			opcode, retval):
-	if (d_enter.has_key(cpu) and d_enter[cpu].has_key(opcode)):
+	if (cpu in d_enter and opcode in d_enter[cpu]):
 		diff = nsecs(sec, nsec) - d_enter[cpu][opcode]

-		if (output.has_key(opcode)):
+		if (opcode in output):
 			output[opcode]['time'] += diff
 			output[opcode]['cnt'] += 1
 			if (output[opcode]['min'] > diff):
@ -190,11 +192,11 @@ def powerpc__hcall_exit(name, context, cpu, sec, nsec, pid, comm, callchain,

 		del d_enter[cpu][opcode]
 #	else:
-#		print "Can't find matching hcall_enter event. Ignoring sample"
+#		print("Can't find matching hcall_enter event. Ignoring sample")

 def powerpc__hcall_entry(event_name, context, cpu, sec, nsec, pid, comm,
 			 callchain, opcode):
-		if (d_enter.has_key(cpu)):
+		if (cpu in d_enter):
 			d_enter[cpu][opcode] = nsecs(sec, nsec)
 		else:
 			d_enter[cpu] = {opcode: nsecs(sec, nsec)}
--- a/tools/perf/scripts/python/sctop.py
+++ b/tools/perf/scripts/python/sctop.py
@ -8,7 +8,14 @@
 # will be refreshed every [interval] seconds.  The default interval is
 # 3 seconds.

-import os, sys, thread, time
+from __future__ import print_function
+
+import os, sys, time
+
+try:
+        import thread
+except ImportError:
+        import _thread as thread

 sys.path.append(os.environ['PERF_EXEC_PATH'] + \
 	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
@ -62,18 +69,19 @@ def print_syscall_totals(interval):
 	while 1:
 		clear_term()
 		if for_comm is not None:
-			print "\nsyscall events for %s:\n\n" % (for_comm),
+			print("\nsyscall events for %s:\n" % (for_comm))
 		else:
-			print "\nsyscall events:\n\n",
+			print("\nsyscall events:\n")

-		print "%-40s  %10s\n" % ("event", "count"),
-		print "%-40s  %10s\n" % ("----------------------------------------", \
-						 "----------"),
+		print("%-40s  %10s" % ("event", "count"))
+		print("%-40s  %10s" %
+                        ("----------------------------------------",
+                        "----------"))

-		for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
+		for id, val in sorted(syscalls.items(), key = lambda kv: (kv[1], kv[0]), \
 					      reverse = True):
 			try:
-				print "%-40s  %10d\n" % (syscall_name(id), val),
+				print("%-40s  %10d" % (syscall_name(id), val))
 			except TypeError:
 				pass
 		syscalls.clear()
--- a/tools/perf/scripts/python/stackcollapse.py
+++ b/tools/perf/scripts/python/stackcollapse.py
@ -19,6 +19,8 @@
 # Written by Paolo Bonzini <pbonzini@redhat.com>
 # Based on Brendan Gregg's stackcollapse-perf.pl script.

+from __future__ import print_function
+
 import os
 import sys
 from collections import defaultdict
@ -120,7 +122,6 @@ def process_event(param_dict):
    lines[stack_string] = lines[stack_string] + 1

 def trace_end():
-    list = lines.keys()
-    list.sort()
+    list = sorted(lines)
    for stack in list:
-        print "%s %d" % (stack, lines[stack])
+        print("%s %d" % (stack, lines[stack]))
--- a/tools/perf/scripts/python/stat-cpi.py
+++ b/tools/perf/scripts/python/stat-cpi.py
@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0

+from __future__ import print_function
+
 data    = {}
 times   = []
 threads = []
@ -19,8 +21,8 @@ def store_key(time, cpu, thread):
        threads.append(thread)

 def store(time, event, cpu, thread, val, ena, run):
-    #print "event %s cpu %d, thread %d, time %d, val %d, ena %d, run %d" % \
-    #      (event, cpu, thread, time, val, ena, run)
+    #print("event %s cpu %d, thread %d, time %d, val %d, ena %d, run %d" %
+    #      (event, cpu, thread, time, val, ena, run))

    store_key(time, cpu, thread)
    key = get_key(time, event, cpu, thread)
@ -58,7 +60,7 @@ def stat__interval(time):
            if ins != 0:
                cpi = cyc/float(ins)

-            print "%15f: cpu %d, thread %d -> cpi %f (%d/%d)" % (time/(float(1000000000)), cpu, thread, cpi, cyc, ins)
+            print("%15f: cpu %d, thread %d -> cpi %f (%d/%d)" % (time/(float(1000000000)), cpu, thread, cpi, cyc, ins))

 def trace_end():
    pass
@ -74,4 +76,4 @@ def trace_end():
 #                if ins != 0:
 #                    cpi = cyc/float(ins)
 #
-#                print "time %.9f, cpu %d, thread %d -> cpi %f" % (time/(float(1000000000)), cpu, thread, cpi)
+#                print("time %.9f, cpu %d, thread %d -> cpi %f" % (time/(float(1000000000)), cpu, thread, cpi))
--- a/tools/perf/scripts/python/syscall-counts-by-pid.py
+++ b/tools/perf/scripts/python/syscall-counts-by-pid.py
@ -5,6 +5,8 @@
 # Displays system-wide system call totals, broken down by syscall.
 # If a [comm] arg is specified, only syscalls called by [comm] are displayed.

+from __future__ import print_function
+
 import os, sys

 sys.path.append(os.environ['PERF_EXEC_PATH'] + \
@ -31,7 +33,7 @@ if len(sys.argv) > 1:
 syscalls = autodict()

 def trace_begin():
-	print "Press control+C to stop and show the summary"
+	print("Press control+C to stop and show the summary")

 def trace_end():
 	print_syscall_totals()
@ -55,20 +57,20 @@ def syscalls__sys_enter(event_name, context, common_cpu,

 def print_syscall_totals():
    if for_comm is not None:
-	    print "\nsyscall events for %s:\n\n" % (for_comm),
+	    print("\nsyscall events for %s:\n" % (for_comm))
    else:
-	    print "\nsyscall events by comm/pid:\n\n",
+	    print("\nsyscall events by comm/pid:\n")

-    print "%-40s  %10s\n" % ("comm [pid]/syscalls", "count"),
-    print "%-40s  %10s\n" % ("----------------------------------------", \
-                                 "----------"),
+    print("%-40s  %10s" % ("comm [pid]/syscalls", "count"))
+    print("%-40s  %10s" % ("----------------------------------------",
+                            "----------"))

    comm_keys = syscalls.keys()
    for comm in comm_keys:
 	    pid_keys = syscalls[comm].keys()
 	    for pid in pid_keys:
-		    print "\n%s [%d]\n" % (comm, pid),
+		    print("\n%s [%d]" % (comm, pid))
 		    id_keys = syscalls[comm][pid].keys()
-		    for id, val in sorted(syscalls[comm][pid].iteritems(), \
-				  key = lambda(k, v): (v, k),  reverse = True):
-			    print "  %-38s  %10d\n" % (syscall_name(id), val),
+		    for id, val in sorted(syscalls[comm][pid].items(), \
+				  key = lambda kv: (kv[1], kv[0]),  reverse = True):
+			    print("  %-38s  %10d" % (syscall_name(id), val))
--- a/tools/perf/scripts/python/syscall-counts.py
+++ b/tools/perf/scripts/python/syscall-counts.py
@ -5,6 +5,8 @@
 # Displays system-wide system call totals, broken down by syscall.
 # If a [comm] arg is specified, only syscalls called by [comm] are displayed.

+from __future__ import print_function
+
 import os
 import sys

@ -28,7 +30,7 @@ if len(sys.argv) > 1:
 syscalls = autodict()

 def trace_begin():
-	print "Press control+C to stop and show the summary"
+	print("Press control+C to stop and show the summary")

 def trace_end():
 	print_syscall_totals()
@ -51,14 +53,14 @@ def syscalls__sys_enter(event_name, context, common_cpu,

 def print_syscall_totals():
    if for_comm is not None:
-	    print "\nsyscall events for %s:\n\n" % (for_comm),
+	    print("\nsyscall events for %s:\n" % (for_comm))
    else:
-	    print "\nsyscall events:\n\n",
+	    print("\nsyscall events:\n")

-    print "%-40s  %10s\n" % ("event", "count"),
-    print "%-40s  %10s\n" % ("----------------------------------------", \
-                                 "-----------"),
+    print("%-40s  %10s" % ("event", "count"))
+    print("%-40s  %10s" % ("----------------------------------------",
+                              "-----------"))

-    for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
+    for id, val in sorted(syscalls.items(), key = lambda kv: (kv[1], kv[0]), \
 				  reverse = True):
-	    print "%-40s  %10d\n" % (syscall_name(id), val),
+	    print("%-40s  %10d" % (syscall_name(id), val))
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@ -1891,6 +1891,7 @@ int symbol__annotate(struct symbol *sym, struct map *map,
 		     struct annotation_options *options,
 		     struct arch **parch)
 {
+	struct annotation *notes = symbol__annotation(sym);
 	struct annotate_args args = {
 		.privsize	= privsize,
 		.evsel		= evsel,
@ -1921,6 +1922,7 @@ int symbol__annotate(struct symbol *sym, struct map *map,

 	args.ms.map = map;
 	args.ms.sym = sym;
+	notes->start = map__rip_2objdump(map, sym->start);

 	return symbol__disassemble(sym, &args);
 }
@ -2796,8 +2798,6 @@ int symbol__annotate2(struct symbol *sym, struct map *map, struct perf_evsel *ev

 	symbol__calc_percent(sym, evsel);

-	notes->start = map__rip_2objdump(map, sym->start);
-
 	annotation__set_offsets(notes, size);
 	annotation__mark_jump_targets(notes, sym);
 	annotation__compute_ipc(notes, size);
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@ -1578,7 +1578,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 {
 	struct perf_session *session;
 	struct perf_data data = {
-		.file      = { .path = input, .fd = -1 },
+		.path	   = input,
 		.mode      = PERF_DATA_MODE_READ,
 		.force     = opts->force,
 	};
@ -1650,7 +1650,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,

 	fprintf(stderr,
 		"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
-		data.file.path, path);
+		data.path, path);

 	fprintf(stderr,
 		"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples",
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@ -7,11 +7,117 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <string.h>
+#include <asm/bug.h>
+#include <sys/types.h>
+#include <dirent.h>

 #include "data.h"
 #include "util.h"
 #include "debug.h"

+static void close_dir(struct perf_data_file *files, int nr)
+{
+	while (--nr >= 1) {
+		close(files[nr].fd);
+		free(files[nr].path);
+	}
+	free(files);
+}
+
+void perf_data__close_dir(struct perf_data *data)
+{
+	close_dir(data->dir.files, data->dir.nr);
+}
+
+int perf_data__create_dir(struct perf_data *data, int nr)
+{
+	struct perf_data_file *files = NULL;
+	int i, ret = -1;
+
+	files = zalloc(nr * sizeof(*files));
+	if (!files)
+		return -ENOMEM;
+
+	data->dir.files = files;
+	data->dir.nr    = nr;
+
+	for (i = 0; i < nr; i++) {
+		struct perf_data_file *file = &files[i];
+
+		if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0)
+			goto out_err;
+
+		ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR);
+		if (ret < 0)
+			goto out_err;
+
+		file->fd = ret;
+	}
+
+	return 0;
+
+out_err:
+	close_dir(files, i);
+	return ret;
+}
+
+int perf_data__open_dir(struct perf_data *data)
+{
+	struct perf_data_file *files = NULL;
+	struct dirent *dent;
+	int ret = -1;
+	DIR *dir;
+	int nr = 0;
+
+	dir = opendir(data->path);
+	if (!dir)
+		return -EINVAL;
+
+	while ((dent = readdir(dir)) != NULL) {
+		struct perf_data_file *file;
+		char path[PATH_MAX];
+		struct stat st;
+
+		snprintf(path, sizeof(path), "%s/%s", data->path, dent->d_name);
+		if (stat(path, &st))
+			continue;
+
+		if (!S_ISREG(st.st_mode) || strncmp(dent->d_name, "data", 4))
+			continue;
+
+		ret = -ENOMEM;
+
+		file = realloc(files, (nr + 1) * sizeof(*files));
+		if (!file)
+			goto out_err;
+
+		files = file;
+		file = &files[nr++];
+
+		file->path = strdup(path);
+		if (!file->path)
+			goto out_err;
+
+		ret = open(file->path, O_RDONLY);
+		if (ret < 0)
+			goto out_err;
+
+		file->fd = ret;
+		file->size = st.st_size;
+	}
+
+	if (!files)
+		return -EINVAL;
+
+	data->dir.files = files;
+	data->dir.nr    = nr;
+	return 0;
+
+out_err:
+	close_dir(files, nr);
+	return ret;
+}
+
 static bool check_pipe(struct perf_data *data)
 {
 	struct stat st;
@ -19,11 +125,11 @@ static bool check_pipe(struct perf_data *data)
 	int fd = perf_data__is_read(data) ?
 		 STDIN_FILENO : STDOUT_FILENO;

-	if (!data->file.path) {
+	if (!data->path) {
 		if (!fstat(fd, &st) && S_ISFIFO(st.st_mode))
 			is_pipe = true;
 	} else {
-		if (!strcmp(data->file.path, "-"))
+		if (!strcmp(data->path, "-"))
 			is_pipe = true;
 	}

@ -37,13 +143,31 @@ static int check_backup(struct perf_data *data)
 {
 	struct stat st;

-	if (!stat(data->file.path, &st) && st.st_size) {
-		/* TODO check errors properly */
+	if (perf_data__is_read(data))
+		return 0;
+
+	if (!stat(data->path, &st) && st.st_size) {
 		char oldname[PATH_MAX];
+		int ret;
+
 		snprintf(oldname, sizeof(oldname), "%s.old",
-			 data->file.path);
-		unlink(oldname);
-		rename(data->file.path, oldname);
+			 data->path);
+
+		ret = rm_rf_perf_data(oldname);
+		if (ret) {
+			pr_err("Can't remove old data: %s (%s)\n",
+			       ret == -2 ?
+			       "Unknown file found" : strerror(errno),
+			       oldname);
+			return -1;
+		}
+
+		if (rename(data->path, oldname)) {
+			pr_err("Can't move data: %s (%s to %s)\n",
+			       strerror(errno),
+			       data->path, oldname);
+			return -1;
+		}
 	}

 	return 0;
@ -82,7 +206,7 @@ static int open_file_read(struct perf_data *data)
 		goto out_close;
 	}

-	data->size = st.st_size;
+	data->file.size = st.st_size;
 	return fd;

 out_close:
@ -95,9 +219,6 @@ static int open_file_write(struct perf_data *data)
 	int fd;
 	char sbuf[STRERR_BUFSIZE];

-	if (check_backup(data))
-		return -1;
-
 	fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC,
 		  S_IRUSR|S_IWUSR);

@ -115,8 +236,22 @@ static int open_file(struct perf_data *data)
 	fd = perf_data__is_read(data) ?
 	     open_file_read(data) : open_file_write(data);

+	if (fd < 0) {
+		free(data->file.path);
+		return -1;
+	}
+
 	data->file.fd = fd;
-	return fd < 0 ? -1 : 0;
+	return 0;
+}
+
+static int open_file_dup(struct perf_data *data)
+{
+	data->file.path = strdup(data->path);
+	if (!data->file.path)
+		return -ENOMEM;
+
+	return open_file(data);
 }

 int perf_data__open(struct perf_data *data)
@ -124,14 +259,18 @@ int perf_data__open(struct perf_data *data)
 	if (check_pipe(data))
 		return 0;

-	if (!data->file.path)
-		data->file.path = "perf.data";
+	if (!data->path)
+		data->path = "perf.data";

-	return open_file(data);
+	if (check_backup(data))
+		return -1;
+
+	return open_file_dup(data);
 }

 void perf_data__close(struct perf_data *data)
 {
+	free(data->file.path);
 	close(data->file.fd);
 }

@ -159,15 +298,15 @@ int perf_data__switch(struct perf_data *data,
 	if (perf_data__is_read(data))
 		return -EINVAL;

-	if (asprintf(&new_filepath, "%s.%s", data->file.path, postfix) < 0)
+	if (asprintf(&new_filepath, "%s.%s", data->path, postfix) < 0)
 		return -ENOMEM;

 	/*
 	 * Only fire a warning, don't return error, continue fill
 	 * original file.
 	 */
-	if (rename(data->file.path, new_filepath))
-		pr_warning("Failed to rename %s to %s\n", data->file.path, new_filepath);
+	if (rename(data->path, new_filepath))
+		pr_warning("Failed to rename %s to %s\n", data->path, new_filepath);

 	if (!at_exit) {
 		close(data->file.fd);
--- a/tools/perf/util/data.h
+++ b/tools/perf/util/data.h
@ -10,16 +10,22 @@ enum perf_data_mode {
 };

 struct perf_data_file {
-	const char	*path;
+	char		*path;
 	int		 fd;
+	unsigned long	 size;
 };

 struct perf_data {
+	const char		*path;
 	struct perf_data_file	 file;
 	bool			 is_pipe;
 	bool			 force;
-	unsigned long		 size;
 	enum perf_data_mode	 mode;
+
+	struct {
+		struct perf_data_file	*files;
+		int			 nr;
+	} dir;
 };

 static inline bool perf_data__is_read(struct perf_data *data)
@ -44,7 +50,7 @@ static inline int perf_data__fd(struct perf_data *data)

 static inline unsigned long perf_data__size(struct perf_data *data)
 {
-	return data->size;
+	return data->file.size;
 }

 int perf_data__open(struct perf_data *data);
@ -63,4 +69,8 @@ ssize_t perf_data_file__write(struct perf_data_file *file,
 int perf_data__switch(struct perf_data *data,
 			   const char *postfix,
 			   size_t pos, bool at_exit);
+
+int perf_data__create_dir(struct perf_data *data, int nr);
+int perf_data__open_dir(struct perf_data *data);
+void perf_data__close_dir(struct perf_data *data);
 #endif /* __PERF_DATA_H */
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@ -527,17 +527,11 @@ static int write_event_desc(struct feat_fd *ff,
 static int write_cmdline(struct feat_fd *ff,
 			 struct perf_evlist *evlist __maybe_unused)
 {
-	char buf[MAXPATHLEN];
-	u32 n;
-	int i, ret;
+	char pbuf[MAXPATHLEN], *buf;
+	int i, ret, n;

 	/* actual path to perf binary */
-	ret = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
-	if (ret <= 0)
-		return -1;
-
-	/* readlink() does not add null termination */
-	buf[ret] = '\0';
+	buf = perf_exe(pbuf, MAXPATHLEN);

 	/* account for binary path */
 	n = perf_env.nr_cmdline + 1;
--- a/tools/perf/util/thread-stack.c
+++ b/tools/perf/util/thread-stack.c
@ -20,6 +20,7 @@
 #include "thread.h"
 #include "event.h"
 #include "machine.h"
+#include "env.h"
 #include "util.h"
 #include "debug.h"
 #include "symbol.h"
@ -29,6 +30,19 @@

 #define STACK_GROWTH 2048

+/*
+ * State of retpoline detection.
+ *
+ * RETPOLINE_NONE: no retpoline detection
+ * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
+ * X86_RETPOLINE_DETECTED: x86 retpoline detected
+ */
+enum retpoline_state_t {
+	RETPOLINE_NONE,
+	X86_RETPOLINE_POSSIBLE,
+	X86_RETPOLINE_DETECTED,
+};
+
 /**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
@ -64,6 +78,7 @@ struct thread_stack_entry {
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
+ * @rstate: used to detect retpolines
 */
 struct thread_stack {
 	struct thread_stack_entry *stack;
@ -76,6 +91,7 @@ struct thread_stack {
 	struct call_return_processor *crp;
 	struct comm *comm;
 	unsigned int arr_sz;
+	enum retpoline_state_t rstate;
 };

 /*
@ -115,10 +131,16 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
 	if (err)
 		return err;

-	if (thread->mg && thread->mg->machine)
-		ts->kernel_start = machine__kernel_start(thread->mg->machine);
-	else
+	if (thread->mg && thread->mg->machine) {
+		struct machine *machine = thread->mg->machine;
+		const char *arch = perf_env__arch(machine->env);
+
+		ts->kernel_start = machine__kernel_start(machine);
+		if (!strcmp(arch, "x86"))
+			ts->rstate = X86_RETPOLINE_POSSIBLE;
+	} else {
 		ts->kernel_start = 1ULL << 63;
+	}
 	ts->crp = crp;

 	return 0;
@ -638,14 +660,57 @@ static int thread_stack__no_call_return(struct thread *thread,
 	else
 		parent = root;

-	/* This 'return' had no 'call', so push and pop top of stack */
-	cp = call_path__findnew(cpr, parent, fsym, ip, ks);
+	if (parent->sym == from_al->sym) {
+		/*
+		 * At the bottom of the stack, assume the missing 'call' was
+		 * before the trace started. So, pop the current symbol and push
+		 * the 'to' symbol.
+		 */
+		if (ts->cnt == 1) {
+			err = thread_stack__call_return(thread, ts, --ts->cnt,
+							tm, ref, false);
+			if (err)
+				return err;
+		}
+
+		if (!ts->cnt) {
+			cp = call_path__findnew(cpr, root, tsym, addr, ks);
+
+			return thread_stack__push_cp(ts, addr, tm, ref, cp,
+						     true, false);
+		}
+
+		/*
+		 * Otherwise assume the 'return' is being used as a jump (e.g.
+		 * retpoline) and just push the 'to' symbol.
+		 */
+		cp = call_path__findnew(cpr, parent, tsym, addr, ks);
+
+		err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
+		if (!err)
+			ts->stack[ts->cnt - 1].non_call = true;
+
+		return err;
+	}
+
+	/*
+	 * Assume 'parent' has not yet returned, so push 'to', and then push and
+	 * pop 'from'.
+	 */
+
+	cp = call_path__findnew(cpr, parent, tsym, addr, ks);

 	err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
 	if (err)
 		return err;

-	return thread_stack__pop_cp(thread, ts, addr, tm, ref, tsym);
+	cp = call_path__findnew(cpr, cp, fsym, ip, ks);
+
+	err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
+	if (err)
+		return err;
+
+	return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
 }

 static int thread_stack__trace_begin(struct thread *thread,
@ -690,6 +755,70 @@ static int thread_stack__trace_end(struct thread_stack *ts,
 				     false, true);
 }

+static bool is_x86_retpoline(const char *name)
+{
+	const char *p = strstr(name, "__x86_indirect_thunk_");
+
+	return p == name || !strcmp(name, "__indirect_thunk_start");
+}
+
+/*
+ * x86 retpoline functions pollute the call graph. This function removes them.
+ * This does not handle function return thunks, nor is there any improvement
+ * for the handling of inline thunks or extern thunks.
+ */
+static int thread_stack__x86_retpoline(struct thread_stack *ts,
+				       struct perf_sample *sample,
+				       struct addr_location *to_al)
+{
+	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
+	struct call_path_root *cpr = ts->crp->cpr;
+	struct symbol *sym = tse->cp->sym;
+	struct symbol *tsym = to_al->sym;
+	struct call_path *cp;
+
+	if (sym && is_x86_retpoline(sym->name)) {
+		/*
+		 * This is a x86 retpoline fn. It pollutes the call graph by
+		 * showing up everywhere there is an indirect branch, but does
+		 * not itself mean anything. Here the top-of-stack is removed,
+		 * by decrementing the stack count, and then further down, the
+		 * resulting top-of-stack is replaced with the actual target.
+		 * The result is that the retpoline functions will no longer
+		 * appear in the call graph. Note this only affects the call
+		 * graph, since all the original branches are left unchanged.
+		 */
+		ts->cnt -= 1;
+		sym = ts->stack[ts->cnt - 2].cp->sym;
+		if (sym && sym == tsym && to_al->addr != tsym->start) {
+			/*
+			 * Target is back to the middle of the symbol we came
+			 * from so assume it is an indirect jmp and forget it
+			 * altogether.
+			 */
+			ts->cnt -= 1;
+			return 0;
+		}
+	} else if (sym && sym == tsym) {
+		/*
+		 * Target is back to the symbol we came from so assume it is an
+		 * indirect jmp and forget it altogether.
+		 */
+		ts->cnt -= 1;
+		return 0;
+	}
+
+	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
+				sample->addr, ts->kernel_start);
+	if (!cp)
+		return -ENOMEM;
+
+	/* Replace the top-of-stack with the actual target */
+	ts->stack[ts->cnt - 1].cp = cp;
+
+	return 0;
+}
+
 int thread_stack__process(struct thread *thread, struct comm *comm,
 			  struct perf_sample *sample,
 			  struct addr_location *from_al,
@ -697,6 +826,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
 			  struct call_return_processor *crp)
 {
 	struct thread_stack *ts = thread__stack(thread, sample->cpu);
+	enum retpoline_state_t rstate;
 	int err = 0;

 	if (ts && !ts->crp) {
@ -712,6 +842,10 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
 		ts->comm = comm;
 	}

+	rstate = ts->rstate;
+	if (rstate == X86_RETPOLINE_DETECTED)
+		ts->rstate = X86_RETPOLINE_POSSIBLE;
+
 	/* Flush stack on exec */
 	if (ts->comm != comm && thread->pid_ == thread->tid) {
 		err = __thread_stack__flush(thread, ts);
@ -748,10 +882,25 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
 					ts->kernel_start);
 		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
 					    cp, false, trace_end);
+
+		/*
+		 * A call to the same symbol but not the start of the symbol,
+		 * may be the start of a x86 retpoline.
+		 */
+		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
+		    from_al->sym == to_al->sym &&
+		    to_al->addr != to_al->sym->start)
+			ts->rstate = X86_RETPOLINE_DETECTED;
+
 	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
 		if (!sample->ip || !sample->addr)
 			return 0;

+		/* x86 retpoline 'return' doesn't match the stack */
+		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
+		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
+			return thread_stack__x86_retpoline(ts, sample, to_al);
+
 		err = thread_stack__pop_cp(thread, ts, sample->addr,
 					   sample->time, ref, from_al->sym);
 		if (err) {
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@ -21,6 +21,7 @@
 #include <linux/time64.h>
 #include <unistd.h>
 #include "strlist.h"
+#include "string2.h"

 /*
 * XXX We need to find a better place for these things...
@ -117,7 +118,38 @@ int mkdir_p(char *path, mode_t mode)
 	return (stat(path, &st) && mkdir(path, mode)) ? -1 : 0;
 }

-int rm_rf(const char *path)
+static bool match_pat(char *file, const char **pat)
+{
+	int i = 0;
+
+	if (!pat)
+		return true;
+
+	while (pat[i]) {
+		if (strglobmatch(file, pat[i]))
+			return true;
+
+		i++;
+	}
+
+	return false;
+}
+
+/*
+ * The depth specify how deep the removal will go.
+ * 0       - will remove only files under the 'path' directory
+ * 1 .. x  - will dive in x-level deep under the 'path' directory
+ *
+ * If specified the pat is array of string patterns ended with NULL,
+ * which are checked upon every file/directory found. Only matching
+ * ones are removed.
+ *
+ * The function returns:
+ *    0 on success
+ *   -1 on removal failure with errno set
+ *   -2 on pattern failure
+ */
+static int rm_rf_depth_pat(const char *path, int depth, const char **pat)
 {
 	DIR *dir;
 	int ret;
@ -144,6 +176,9 @@ int rm_rf(const char *path)
 		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
 			continue;

+		if (!match_pat(d->d_name, pat))
+			return -2;
+
 		scnprintf(namebuf, sizeof(namebuf), "%s/%s",
 			  path, d->d_name);

@ -155,7 +190,7 @@ int rm_rf(const char *path)
 		}

 		if (S_ISDIR(statbuf.st_mode))
-			ret = rm_rf(namebuf);
+			ret = depth ? rm_rf_depth_pat(namebuf, depth - 1, pat) : 0;
 		else
 			ret = unlink(namebuf);
 	}
@ -167,6 +202,22 @@ int rm_rf(const char *path)
 	return rmdir(path);
 }

+int rm_rf_perf_data(const char *path)
+{
+	const char *pat[] = {
+		"header",
+		"data.*",
+		NULL,
+	};
+
+	return rm_rf_depth_pat(path, 0, pat);
+}
+
+int rm_rf(const char *path)
+{
+	return rm_rf_depth_pat(path, INT_MAX, NULL);
+}
+
 /* A filter which removes dot files */
 bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d)
 {
@ -517,3 +568,13 @@ out:

 	return tip;
 }
+
+char *perf_exe(char *buf, int len)
+{
+	int n = readlink("/proc/self/exe", buf, len);
+	if (n > 0) {
+		buf[n] = 0;
+		return buf;
+	}
+	return strcpy(buf, "perf");
+}
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@ -31,6 +31,7 @@ struct strlist;

 int mkdir_p(char *path, mode_t mode);
 int rm_rf(const char *path);
+int rm_rf_perf_data(const char *path);
 struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *));
 bool lsdir_no_dot_filter(const char *name, struct dirent *d);
 int copyfile(const char *from, const char *to);
@ -76,6 +77,8 @@ extern bool perf_singlethreaded;
 void perf_set_singlethreaded(void);
 void perf_set_multithreaded(void);

+char *perf_exe(char *buf, int len);
+
 #ifndef O_CLOEXEC
 #ifdef __sparc__
 #define O_CLOEXEC      0x400000