2019-05-19 20:08:55 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2011-03-03 10:34:47 +08:00
|
|
|
/*
|
2011-06-06 22:57:03 +08:00
|
|
|
* Per core/cpu state
|
|
|
|
*
|
|
|
|
* Used to coordinate shared registers between HT threads or
|
|
|
|
* among events on a single PMU.
|
2011-03-03 10:34:47 +08:00
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
2011-05-27 00:22:53 +08:00
|
|
|
#include <linux/export.h>
|
2015-09-05 06:45:12 +08:00
|
|
|
#include <linux/nmi.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
#include <asm/cpufeature.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <asm/hardirq.h>
|
2016-06-03 08:19:29 +08:00
|
|
|
#include <asm/intel-family.h>
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
#include <asm/intel_pt.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <asm/apic.h>
|
2019-02-05 06:23:30 +08:00
|
|
|
#include <asm/cpu_device_id.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2016-02-10 17:55:23 +08:00
|
|
|
#include "../perf_event.h"
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
2010-02-01 22:36:30 +08:00
|
|
|
* Intel PerfMon, used on Core and later.
|
2010-02-26 19:05:05 +08:00
|
|
|
*/
|
2011-04-27 17:51:41 +08:00
|
|
|
static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2012-07-06 17:59:46 +08:00
|
|
|
[PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
|
|
|
|
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
|
|
|
|
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
|
|
|
|
[PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
|
|
|
|
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
|
|
|
|
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
|
|
|
|
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
|
|
|
|
[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Counter constraints for the "core" PMU flavour.
 *
 * Second macro argument: presumably the bitmask of general-purpose counters
 * the event may be scheduled on -- confirm against ../perf_event.h.
 */
static struct event_constraint intel_core_event_constraints[] __read_mostly = {
	INTEL_EVENT_CONSTRAINT(0x11, 0x2),	/* FP_ASSIST */
	INTEL_EVENT_CONSTRAINT(0x12, 0x2),	/* MUL */
	INTEL_EVENT_CONSTRAINT(0x13, 0x2),	/* DIV */
	INTEL_EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY */
	INTEL_EVENT_CONSTRAINT(0x19, 0x2),	/* DELAYED_BYPASS */
	INTEL_EVENT_CONSTRAINT(0xc1, 0x1),	/* FP_COMP_INSTR_RET */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Counter constraints for the "core2" PMU flavour: the three fixed-counter
 * events first, then the events tied to specific general-purpose counters.
 */
static struct event_constraint intel_core2_event_constraints[] __read_mostly = {
	/* fixed counters */
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */

	/* general-purpose counters */
	INTEL_EVENT_CONSTRAINT(0x10, 0x1),	/* FP_COMP_OPS_EXE */
	INTEL_EVENT_CONSTRAINT(0x11, 0x2),	/* FP_ASSIST */
	INTEL_EVENT_CONSTRAINT(0x12, 0x2),	/* MUL */
	INTEL_EVENT_CONSTRAINT(0x13, 0x2),	/* DIV */
	INTEL_EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY */
	INTEL_EVENT_CONSTRAINT(0x18, 0x1),	/* IDLE_DURING_DIV */
	INTEL_EVENT_CONSTRAINT(0x19, 0x2),	/* DELAYED_BYPASS */
	INTEL_EVENT_CONSTRAINT(0xa1, 0x1),	/* RS_UOPS_DISPATCH_CYCLES */
	INTEL_EVENT_CONSTRAINT(0xc9, 0x1),	/* ITLB_MISS_RETIRED (T30-9) */
	INTEL_EVENT_CONSTRAINT(0xcb, 0x1),	/* MEM_LOAD_RETIRED */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Counter constraints for Nehalem: fixed-counter events plus the L1D and
 * cache-lock events, all of which carry counter mask 0x3.
 */
static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = {
	/* fixed counters */
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */

	/* general-purpose counters */
	INTEL_EVENT_CONSTRAINT(0x40, 0x3),	/* L1D_CACHE_LD */
	INTEL_EVENT_CONSTRAINT(0x41, 0x3),	/* L1D_CACHE_ST */
	INTEL_EVENT_CONSTRAINT(0x42, 0x3),	/* L1D_CACHE_LOCK */
	INTEL_EVENT_CONSTRAINT(0x43, 0x3),	/* L1D_ALL_REF */
	INTEL_EVENT_CONSTRAINT(0x48, 0x3),	/* L1D_PEND_MISS */
	INTEL_EVENT_CONSTRAINT(0x4e, 0x3),	/* L1D_PREFETCH */
	INTEL_EVENT_CONSTRAINT(0x51, 0x3),	/* L1D */
	INTEL_EVENT_CONSTRAINT(0x63, 0x3),	/* CACHE_LOCK_CYCLES */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Events on Nehalem that need an extra MSR programmed alongside the
 * counter: one offcore-response register and the PEBS load-latency event.
 */
static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = {
	/* OFFCORE_RSP_X has to be defined first, see intel_fixup_er() */
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),

	EVENT_EXTRA_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Counter constraints for Westmere: fixed-counter events followed by the
 * events limited to a subset of the general-purpose counters.
 */
static struct event_constraint intel_westmere_event_constraints[] __read_mostly = {
	/* fixed counters */
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */

	/* general-purpose counters */
	INTEL_EVENT_CONSTRAINT(0x51, 0x3),	/* L1D */
	INTEL_EVENT_CONSTRAINT(0x60, 0x1),	/* OFFCORE_REQUESTS_OUTSTANDING */
	INTEL_EVENT_CONSTRAINT(0x63, 0x3),	/* CACHE_LOCK_CYCLES */
	INTEL_EVENT_CONSTRAINT(0xb3, 0x1),	/* SNOOPQ_REQUEST_OUTSTANDING */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Counter constraints for Sandy Bridge (snb).
 *
 * Each (umask << 8 | event) code appears exactly once.  Entries with an
 * identical code and counter mask add nothing to the table, so
 * CYCLE_ACTIVITY.CYCLES_NO_DISPATCH (0x04a3) and
 * CYCLE_ACTIVITY.CYCLES_L1D_PENDING (0x02a3) are listed a single time.
 */
static struct event_constraint intel_snb_event_constraints[] __read_mostly = {
	/* fixed counters */
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */

	/* general-purpose counters */
	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),	/* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
	INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf),	/* CYCLE_ACTIVITY.STALLS_L2_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4),	/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4),	/* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
	INTEL_EVENT_CONSTRAINT(0x48, 0x4),	/* L1D_PEND_MISS.PENDING */
	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2),	/* INST_RETIRED.PREC_DIST */
	INTEL_EVENT_CONSTRAINT(0xcd, 0x8),	/* MEM_TRANS_RETIRED.LOAD_LATENCY */

	/*
	 * When HT is off these events can only run on the bottom 4 counters
	 * When HT is on, they are impacted by the HT bug and require EXCL access
	 */
	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),	/* MEM_UOPS_RETIRED.* */
	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),	/* MEM_LOAD_UOPS_RETIRED.* */
	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),	/* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),	/* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
/*
 * Counter constraints for Ivy Bridge (ivb): fixed-counter events, the
 * events limited to particular general-purpose counters, and the memory
 * events that need exclusive access when HT is on.
 */
static struct event_constraint intel_ivb_event_constraints[] __read_mostly = {
	/* fixed counters */
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */

	/* general-purpose counters */
	INTEL_UEVENT_CONSTRAINT(0x0148, 0x4),	/* L1D_PEND_MISS.PENDING */
	INTEL_UEVENT_CONSTRAINT(0x0279, 0xf),	/* IDQ.EMPTY */
	INTEL_UEVENT_CONSTRAINT(0x019c, 0xf),	/* IDQ_UOPS_NOT_DELIVERED.CORE */
	INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),	/* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),	/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
	INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf),	/* CYCLE_ACTIVITY.STALLS_L2_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf),	/* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),	/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),	/* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2),	/* INST_RETIRED.PREC_DIST */

	/*
	 * With HT off, the events below are limited to the bottom 4 counters.
	 * With HT on, they are hit by the HT bug and need EXCL access.
	 */
	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),	/* MEM_UOPS_RETIRED.* */
	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),	/* MEM_LOAD_UOPS_RETIRED.* */
	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),	/* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),	/* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/*
 * Events on Westmere that need an extra MSR: both offcore-response
 * registers and the PEBS load-latency event.
 */
static struct extra_reg intel_westmere_extra_regs[] __read_mostly = {
	/* OFFCORE_RSP_X has to be defined first, see intel_fixup_er() */
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),

	EVENT_EXTRA_END
};
|
|
|
|
|
2011-06-29 23:42:36 +08:00
|
|
|
/* Empty constraint table: holds nothing but the terminator. */
static struct event_constraint intel_v1_event_constraints[] __read_mostly = {
	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
/* Generic constraint table: only the three fixed-counter events. */
static struct event_constraint intel_gen_event_constraints[] __read_mostly = {
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2013-07-18 17:02:24 +08:00
|
|
|
/* Constraint table for Silvermont (slm): only the fixed-counter events. */
static struct event_constraint intel_slm_event_constraints[] __read_mostly = {
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* pseudo CPU_CLK_UNHALTED.REF */

	EVENT_CONSTRAINT_END
};
|
|
|
|
|
2016-05-17 05:16:18 +08:00
|
|
|
static struct event_constraint intel_skl_event_constraints[] = {
|
2015-05-11 03:22:44 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
2016-07-02 06:22:22 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* when HT is off, these can only run on the bottom 4 counters
|
|
|
|
*/
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xc6, 0xf), /* FRONTEND_RETIRED.* */
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
/*
 * Offcore-response extra registers for Knights Landing (knl).
 *
 * The third argument is the mask of bits accepted for each MSR; the two
 * registers have different reserved bits, hence the two different masks.
 */
static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0,
			       0x799ffbb6e7ull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1,
			       0x399ffbffe7ull, RSP_1),

	EVENT_EXTRA_END
};
|
|
|
|
|
2011-06-06 22:57:12 +08:00
|
|
|
/*
 * Events on Sandy Bridge (snb) that need an extra MSR: both
 * offcore-response registers and the PEBS load-latency event.
 */
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
	/* OFFCORE_RSP_X has to be defined first, see intel_fixup_er() */
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0,
			       0x3f807f8fffull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1,
			       0x3f807f8fffull, RSP_1),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),

	EVENT_EXTRA_END
};
|
|
|
|
|
|
|
|
/*
 * Extra-register table for the "snbep" variant.  Same layout as
 * intel_snb_extra_regs, but with a wider valid-bit mask (0x3fffff8fff)
 * on the offcore-response MSRs.
 */
static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
	/* OFFCORE_RSP_X has to be defined first, see intel_fixup_er() */
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0,
			       0x3fffff8fffull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1,
			       0x3fffff8fffull, RSP_1),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),

	EVENT_EXTRA_END
};
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
/*
 * Events on Skylake (skl) that need an extra MSR: the two offcore-response
 * registers, the PEBS load-latency event and the PEBS frontend event.
 */
static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0,
			       0x3fffff8fffull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1,
			       0x3fffff8fffull, RSP_1),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),

	/*
	 * The low 8 bits (the eventsel code) are not one contiguous field:
	 * some of those bits #GP when set, so the mask leaves them out.
	 */
	INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),

	EVENT_EXTRA_END
};
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct event_constraint intel_icl_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
2020-09-28 21:47:26 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
|
2019-04-03 03:45:05 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
|
2020-07-24 01:11:13 +08:00
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
|
2019-04-03 03:45:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */
|
2020-10-20 00:45:29 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
|
2019-04-03 03:45:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf),
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Events on Icelake (icl) that need an extra MSR.
 *
 * The offcore-response mask 0x3fffffbfff includes bit 13, which is a
 * valid bit of MSR_OFFCORE_RSP_x on Icelake.
 */
static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0,
			       0x3fffffbfffull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1,
			       0x3fffffbfffull, RSP_1),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
	INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),

	EVENT_EXTRA_END
};
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return an unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accessing the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latency, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
/*
 * Events on Sapphire Rapids (spr) that need an extra MSR.  The
 * offcore-response events use codes 0x012a/0x012b here, unlike the
 * 0x01b7/0x01bb codes in the older tables in this file.
 */
static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
	INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0,
			       0x3fffffffffull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1,
			       0x3fffffffffull, RSP_1),
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
	INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),

	EVENT_EXTRA_END
};
|
|
|
|
|
|
|
|
static struct event_constraint intel_spr_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
|
|
|
|
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
|
|
|
|
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x3c, 0xff),
|
|
|
|
/*
|
|
|
|
* Generally event codes < 0x90 are restricted to counters 0-3.
|
|
|
|
* The 0x2E and 0x3C are exception, which has no restriction.
|
|
|
|
*/
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
|
|
|
|
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xce, 0x1),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
|
|
|
|
/*
|
|
|
|
* Generally event codes >= 0x90 are likely to have no restrictions.
|
|
|
|
* The exception are defined as above.
|
|
|
|
*/
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
|
|
|
|
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2013-09-13 01:17:00 +08:00
|
|
|
/*
 * Named memory-sampling event strings: "mem-loads" has an nhm and an snb
 * encoding, "mem-stores" an snb one.  The load variants carry ldlat=3.
 */
EVENT_ATTR_STR(mem-loads,
	       mem_ld_nhm,
	       "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads,
	       mem_ld_snb,
	       "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores,
	       mem_st_snb,
	       "event=0xcd,umask=0x2");
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
static struct attribute *nhm_mem_events_attrs[] = {
|
2013-01-24 23:10:32 +08:00
|
|
|
EVENT_PTR(mem_ld_nhm),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2016-05-20 08:09:57 +08:00
|
|
|
/*
|
|
|
|
* topdown events for Intel Core CPUs.
|
|
|
|
*
|
|
|
|
* The events are all in slots, which is a free slot in a 4 wide
|
|
|
|
* pipeline. Some events are already reported in slots, for cycle
|
|
|
|
* events we multiply by the pipeline width (4).
|
|
|
|
*
|
|
|
|
* With Hyper Threading on, topdown metrics are either summed or averaged
|
|
|
|
* between the threads of a core: (count_t0 + count_t1).
|
|
|
|
*
|
|
|
|
* For the average case the metric is always scaled to pipeline width,
|
|
|
|
* so we use factor 2 ((count_t0 + count_t1) / 2 * 4)
|
|
|
|
*/
|
|
|
|
|
|
|
|
EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots,
|
|
|
|
"event=0x3c,umask=0x0", /* cpu_clk_unhalted.thread */
|
|
|
|
"event=0x3c,umask=0x0,any=1"); /* cpu_clk_unhalted.thread_any */
|
|
|
|
EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2");
|
|
|
|
EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued,
|
|
|
|
"event=0xe,umask=0x1"); /* uops_issued.any */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired,
|
|
|
|
"event=0xc2,umask=0x2"); /* uops_retired.retire_slots */
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles,
|
|
|
|
"event=0x9c,umask=0x1"); /* idq_uops_not_delivered_core */
|
|
|
|
EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
|
|
|
|
"event=0xd,umask=0x3,cmask=1", /* int_misc.recovery_cycles */
|
|
|
|
"event=0xd,umask=0x3,cmask=1,any=1"); /* int_misc.recovery_cycles_any */
|
|
|
|
EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
|
|
|
|
"4", "2");
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return an unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accessing the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latency, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
|
|
|
|
EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
|
|
|
|
EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
|
|
|
|
EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
|
|
|
|
EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84");
|
|
|
|
EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85");
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86");
|
|
|
|
EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87");
|
2020-07-24 01:11:13 +08:00
|
|
|
|
2016-05-17 05:16:18 +08:00
|
|
|
static struct attribute *snb_events_attrs[] = {
|
2016-05-20 08:09:57 +08:00
|
|
|
EVENT_PTR(td_slots_issued),
|
|
|
|
EVENT_PTR(td_slots_retired),
|
|
|
|
EVENT_PTR(td_fetch_bubbles),
|
|
|
|
EVENT_PTR(td_total_slots),
|
|
|
|
EVENT_PTR(td_total_slots_scale),
|
|
|
|
EVENT_PTR(td_recovery_bubbles),
|
|
|
|
EVENT_PTR(td_recovery_bubbles_scale),
|
2013-01-24 23:10:32 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
static struct attribute *snb_mem_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_snb),
|
|
|
|
EVENT_PTR(mem_st_snb),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint intel_hsw_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2015-11-25 01:05:01 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
|
2013-06-18 08:36:48 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
|
|
|
|
/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
|
2013-06-18 08:36:48 +08:00
|
|
|
/* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
|
2013-06-18 08:36:48 +08:00
|
|
|
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
|
2014-11-18 03:06:59 +08:00
|
|
|
|
2016-07-02 06:22:22 +08:00
|
|
|
/*
|
|
|
|
* When HT is off these events can only run on the bottom 4 counters
|
|
|
|
* When HT is on, they are impacted by the HT bug and require EXCL access
|
|
|
|
*/
|
2014-11-18 03:06:59 +08:00
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2016-05-17 05:16:18 +08:00
|
|
|
static struct event_constraint intel_bdw_event_constraints[] = {
|
2015-02-18 10:18:05 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
|
2015-11-17 08:21:07 +08:00
|
|
|
INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
|
2016-07-02 06:22:22 +08:00
|
|
|
/*
|
|
|
|
* when HT is off, these can only run on the bottom 4 counters
|
|
|
|
*/
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
|
2015-02-18 10:18:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static u64 intel_pmu_event_map(int hw_event)
|
|
|
|
{
|
|
|
|
return intel_perfmon_event_map[hw_event];
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error, -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accesses to the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bits 15:0) and cache access
latency (bits 47:32), are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in the low 32 bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in bits 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
static __initconst const u64 spr_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0,
|
|
|
|
[ C(RESULT_MISS) ] = 0xe124,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_MISS) ] = 0xe424,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x12a,
|
|
|
|
[ C(RESULT_MISS) ] = 0x12a,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x12a,
|
|
|
|
[ C(RESULT_MISS) ] = 0x12a,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0,
|
|
|
|
[ C(RESULT_MISS) ] = 0xe12,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0,
|
|
|
|
[ C(RESULT_MISS) ] = 0xe13,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = 0xe11,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4c4,
|
|
|
|
[ C(RESULT_MISS) ] = 0x4c5,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x12a,
|
|
|
|
[ C(RESULT_MISS) ] = 0x12a,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 spr_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x10001,
|
|
|
|
[ C(RESULT_MISS) ] = 0x3fbfc00001,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x3f3ffc0002,
|
|
|
|
[ C(RESULT_MISS) ] = 0x3f3fc00002,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x10c000001,
|
|
|
|
[ C(RESULT_MISS) ] = 0x3fb3000001,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
/*
 * Notes on the events:
 * - data reads do not include code reads (comparable to earlier tables)
 * - data counts include speculative execution (except L1 write, dtlb, bpu)
 * - remote node access includes remote memory, remote cache, remote mmio.
 * - prefetches are not included in the counts.
 * - icache miss does not include decoded icache
 */

/*
 * Single-bit fields (and unions of them) that skl_hw_cache_extra_regs
 * combines into 64-bit mask values: request type (bits 0-1), response /
 * supplier (bits 16-17), L3-miss source (bits 26-29) and snoop result
 * (bits 30-37).
 */
#define SKL_DEMAND_DATA_RD		BIT_ULL(0)
#define SKL_DEMAND_RFO			BIT_ULL(1)
#define SKL_ANY_RESPONSE		BIT_ULL(16)
#define SKL_SUPPLIER_NONE		BIT_ULL(17)
#define SKL_L3_MISS_LOCAL_DRAM		BIT_ULL(26)
#define SKL_L3_MISS_REMOTE_HOP0_DRAM	BIT_ULL(27)
#define SKL_L3_MISS_REMOTE_HOP1_DRAM	BIT_ULL(28)
#define SKL_L3_MISS_REMOTE_HOP2P_DRAM	BIT_ULL(29)
/* Any L3 miss source, local or remote. */
#define SKL_L3_MISS			(SKL_L3_MISS_LOCAL_DRAM| \
					 SKL_L3_MISS_REMOTE_HOP0_DRAM| \
					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
#define SKL_SPL_HIT			BIT_ULL(30)
#define SKL_SNOOP_NONE			BIT_ULL(31)
#define SKL_SNOOP_NOT_NEEDED		BIT_ULL(32)
#define SKL_SNOOP_MISS			BIT_ULL(33)
#define SKL_SNOOP_HIT_NO_FWD		BIT_ULL(34)
#define SKL_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
#define SKL_SNOOP_HITM			BIT_ULL(36)
#define SKL_SNOOP_NON_DRAM		BIT_ULL(37)
/* Every snoop-result bit, 30 through 37. */
#define SKL_ANY_SNOOP			(SKL_SPL_HIT|SKL_SNOOP_NONE| \
					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
					 SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
#define SKL_DEMAND_READ			SKL_DEMAND_DATA_RD
/* SKL_ANY_SNOOP without SKL_SNOOP_NON_DRAM. */
#define SKL_SNOOP_DRAM			(SKL_SNOOP_NONE| \
					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
					 SKL_SNOOP_HITM|SKL_SPL_HIT)
#define SKL_DEMAND_WRITE		SKL_DEMAND_RFO
#define SKL_LLC_ACCESS			SKL_ANY_RESPONSE
/* The three remote-hop L3 miss sources, without the local one. */
#define SKL_L3_MISS_REMOTE		(SKL_L3_MISS_REMOTE_HOP0_DRAM| \
					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 skl_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
|
2017-06-19 22:26:09 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
|
2015-05-11 03:22:44 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
|
2017-06-19 22:26:09 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
|
2015-05-11 03:22:44 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 skl_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_LLC_ACCESS|SKL_ANY_SNOOP,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS|SKL_ANY_SNOOP|
|
|
|
|
SKL_SUPPLIER_NONE,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_LLC_ACCESS|SKL_ANY_SNOOP,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS|SKL_ANY_SNOOP|
|
|
|
|
SKL_SUPPLIER_NONE,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2012-07-17 17:27:55 +08:00
|
|
|
/*
 * Bit fields that snb_hw_cache_extra_regs combines into 64-bit mask
 * values: request type (bits 0-11, 15), response / supplier
 * (bits 16-30) and snoop result (bits 31-37).
 */
#define SNB_DMND_DATA_RD	(1ULL << 0)
#define SNB_DMND_RFO		(1ULL << 1)
#define SNB_DMND_IFETCH		(1ULL << 2)
#define SNB_DMND_WB		(1ULL << 3)
#define SNB_PF_DATA_RD		(1ULL << 4)
#define SNB_PF_RFO		(1ULL << 5)
#define SNB_PF_IFETCH		(1ULL << 6)
#define SNB_LLC_DATA_RD		(1ULL << 7)
#define SNB_LLC_RFO		(1ULL << 8)
#define SNB_LLC_IFETCH		(1ULL << 9)
#define SNB_BUS_LOCKS		(1ULL << 10)
#define SNB_STRM_ST		(1ULL << 11)
#define SNB_OTHER		(1ULL << 15)
#define SNB_RESP_ANY		(1ULL << 16)
#define SNB_NO_SUPP		(1ULL << 17)
#define SNB_LLC_HITM		(1ULL << 18)
#define SNB_LLC_HITE		(1ULL << 19)
#define SNB_LLC_HITS		(1ULL << 20)
#define SNB_LLC_HITF		(1ULL << 21)
#define SNB_LOCAL		(1ULL << 22)
#define SNB_REMOTE		(0xffULL << 23)	/* eight bits: 23-30 */
#define SNB_SNP_NONE		(1ULL << 31)
#define SNB_SNP_NOT_NEEDED	(1ULL << 32)
#define SNB_SNP_MISS		(1ULL << 33)
#define SNB_NO_FWD		(1ULL << 34)
#define SNB_SNP_FWD		(1ULL << 35)
#define SNB_HITM		(1ULL << 36)
#define SNB_NON_DRAM		(1ULL << 37)

/* Request-type groups used as the "operation" part of a mask. */
#define SNB_DMND_READ		(SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
#define SNB_DMND_WRITE		(SNB_DMND_RFO|SNB_LLC_RFO)
#define SNB_DMND_PREFETCH	(SNB_PF_DATA_RD|SNB_PF_RFO)

/* Every snoop-result bit except SNB_NON_DRAM (bits 31-36). */
#define SNB_SNP_ANY		(SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
				 SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
				 SNB_HITM)

#define SNB_DRAM_ANY		(SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
#define SNB_DRAM_REMOTE		(SNB_REMOTE|SNB_SNP_ANY)

#define SNB_L3_ACCESS		SNB_RESP_ANY
#define SNB_L3_MISS		(SNB_DRAM_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 snb_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
static __initconst const u64 snb_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
};
|
|
|
|
|
2015-02-18 10:18:04 +08:00
|
|
|
/*
 * Notes on the events:
 * - data reads do not include code reads (comparable to earlier tables)
 * - data counts include speculative execution (except L1 write, dtlb, bpu)
 * - remote node access includes remote memory, remote cache, remote mmio.
 * - prefetches are not included in the counts because they are not
 *   reliably counted.
 */

/*
 * Single-bit fields (and unions of them) that hsw_hw_cache_extra_regs
 * combines into 64-bit mask values: request type (bits 0-1), response /
 * supplier (bits 16-17), L3-miss source (bit 22 local, bits 27-29
 * remote) and snoop result (bits 31-37).
 */
#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
#define HSW_DEMAND_RFO			BIT_ULL(1)
#define HSW_ANY_RESPONSE		BIT_ULL(16)
#define HSW_SUPPLIER_NONE		BIT_ULL(17)
#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
/* Any L3 miss source, local or remote. */
#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
					 HSW_L3_MISS_REMOTE_HOP2P)
#define HSW_SNOOP_NONE			BIT_ULL(31)
#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
#define HSW_SNOOP_MISS			BIT_ULL(33)
#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
#define HSW_SNOOP_HITM			BIT_ULL(36)
#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
/* Every snoop-result bit, 31 through 37. */
#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
/* HSW_ANY_SNOOP without HSW_SNOOP_NON_DRAM. */
#define HSW_SNOOP_DRAM			(HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
/* The three remote-hop L3 miss sources, without the local one. */
#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
#define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
|
|
|
|
|
2015-02-18 10:18:05 +08:00
|
|
|
/*
 * "bdw" variant of the L3-miss selection: the local-miss source is
 * bit 26 here, while the remote-hop bits are shared with the HSW_*
 * definitions.
 *
 * BIT_ULL() rather than BIT() keeps this bit the same 64-bit type as
 * the HSW_* bits it is OR-ed with.
 */
#define BDW_L3_MISS_LOCAL		BIT_ULL(26)
#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
					 HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
|
|
|
|
|
2015-02-18 10:18:04 +08:00
|
|
|
static __initconst const u64 hsw_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 hsw_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS|HSW_ANY_SNOOP,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS|HSW_ANY_SNOOP,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS_LOCAL_DRAM|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS_REMOTE|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS_LOCAL_DRAM|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS_REMOTE|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 westmere_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
|
|
|
* Use RFO, not WRITEBACK, because a write miss would typically occur
|
|
|
|
* on RFO.
|
|
|
|
*/
|
2010-02-26 19:05:05 +08:00
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
 * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
 * See IA32 SDM Vol 3B 30.6.1.3
 *
 * One flag per bit position, bits 0 through 15.  Bit 11 is reserved and
 * therefore has no name.
 */
#define NHM_DMND_DATA_RD	(1 << 0)
#define NHM_DMND_RFO		(1 << 1)
#define NHM_DMND_IFETCH		(1 << 2)
#define NHM_DMND_WB		(1 << 3)
#define NHM_PF_DATA_RD		(1 << 4)
#define NHM_PF_DATA_RFO		(1 << 5)
#define NHM_PF_IFETCH		(1 << 6)
#define NHM_OFFCORE_OTHER	(1 << 7)
#define NHM_UNCORE_HIT		(1 << 8)
#define NHM_OTHER_CORE_HIT_SNP	(1 << 9)
#define NHM_OTHER_CORE_HITM	(1 << 10)
        			/* bit 11: reserved */
#define NHM_REMOTE_CACHE_FWD	(1 << 12)
#define NHM_REMOTE_DRAM		(1 << 13)
#define NHM_LOCAL_DRAM		(1 << 14)
#define NHM_NON_DRAM		(1 << 15)
|
|
|
|
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
/*
 * Composite masks built from the single-bit NHM_* flags.
 *
 * NHM_LOCAL deliberately includes NHM_REMOTE_CACHE_FWD next to
 * NHM_LOCAL_DRAM, while NHM_REMOTE is NHM_REMOTE_DRAM only; this split
 * comes from the "Fix local vs remote memory events for NHM/WSM" change.
 */
#define NHM_LOCAL		(NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
#define NHM_REMOTE		(NHM_REMOTE_DRAM)

/* Request-type groups: which request bits count as read/write/prefetch. */
#define NHM_DMND_READ		(NHM_DMND_DATA_RD)
#define NHM_DMND_WRITE		(NHM_DMND_RFO|NHM_DMND_WB)
#define NHM_DMND_PREFETCH	(NHM_PF_DATA_RD|NHM_PF_DATA_RFO)

/* Response bits that together are treated as an L3 hit. */
#define NHM_L3_HIT	(NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
/*
 * Response bits treated as an L3 miss, and the union of hit and miss
 * responses used for "any L3 access".
 */
#define NHM_L3_MISS	(NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
#define NHM_L3_ACCESS	(NHM_L3_HIT|NHM_L3_MISS)
|
2011-03-03 10:34:48 +08:00
|
|
|
|
|
|
|
static __initconst const u64 nehalem_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 nehalem_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
perf, x86: Update/fix Intel Nehalem cache events
Change the Nehalem cache events to use retired memory instruction counters
(similar to Westmere), this greatly improves the provided stats.
Using:
main ()
{
int i;
for (i = 0; i < 1000000000; i++) {
asm("mov (%%rsp), %%rbx;"
"mov %%rbx, (%%rsp);" : : : "rbx");
}
}
We find:
$ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% )
4,999,502,846 l1-dcache-loads:u ( +- 0.008% )
1,000,034,832 l1-dcache-stores:u ( +- 0.000% )
1.565184942 seconds time elapsed ( +- 0.005% )
The 5b is surprising - we'd expect 1b:
$ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% )
1,000,021,961 r10b:u ( +- 0.000% )
1,000,030,951 l1-dcache-stores:u ( +- 0.000% )
1.565055422 seconds time elapsed ( +- 0.003% )
Which this patch thus fixes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-22 19:39:56 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
perf, x86: Update/fix Intel Nehalem cache events
Change the Nehalem cache events to use retired memory instruction counters
(similar to Westmere), this greatly improves the provided stats.
Using:
main ()
{
int i;
for (i = 0; i < 1000000000; i++) {
asm("mov (%%rsp), %%rbx;"
"mov %%rbx, (%%rsp);" : : : "rbx");
}
}
We find:
$ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% )
4,999,502,846 l1-dcache-loads:u ( +- 0.008% )
1,000,034,832 l1-dcache-stores:u ( +- 0.000% )
1.565184942 seconds time elapsed ( +- 0.005% )
The 5b is surprising - we'd expect 1b:
$ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% )
1,000,021,961 r10b:u ( +- 0.000% )
1,000,030,951 l1-dcache-stores:u ( +- 0.000% )
1.565055422 seconds time elapsed ( +- 0.003% )
Which this patch thus fixes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-22 19:39:56 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
|
|
|
* Use RFO, not WRITEBACK, because a write miss would typically occur
|
|
|
|
* on RFO.
|
|
|
|
*/
|
2010-02-26 19:05:05 +08:00
|
|
|
[ C(OP_WRITE) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 core2_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 atom_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2016-05-20 08:09:58 +08:00
|
|
|
EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c");
|
|
|
|
EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2");
|
|
|
|
/* no_alloc_cycles.not_delivered */
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm,
|
|
|
|
"event=0xca,umask=0x50");
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2");
|
|
|
|
/* uops_retired.all */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm,
|
|
|
|
"event=0xc2,umask=0x10");
|
|
|
|
/* uops_retired.all */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm,
|
|
|
|
"event=0xc2,umask=0x10");
|
|
|
|
|
|
|
|
/*
 * NULL-terminated list collecting the six *_slm top-down event attributes
 * declared with EVENT_ATTR_STR() in this file: total slots and its scale,
 * fetch bubbles and its scale, slots issued and slots retired.
 */
static struct attribute *slm_events_attrs[] = {
|
|
|
|
EVENT_PTR(td_total_slots_slm),
|
|
|
|
EVENT_PTR(td_total_slots_scale_slm),
|
|
|
|
EVENT_PTR(td_fetch_bubbles_slm),
|
|
|
|
EVENT_PTR(td_fetch_bubbles_scale_slm),
|
|
|
|
EVENT_PTR(td_slots_issued_slm),
|
|
|
|
EVENT_PTR(td_slots_retired_slm),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2013-07-18 17:02:24 +08:00
|
|
|
static struct extra_reg intel_slm_extra_regs[] __read_mostly =
|
|
|
|
{
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
perf/x86/intel: Fix Silvermont offcore masks
Fengguang Wu reported:
> sparse warnings: (new ones prefixed by >>)
>
> >> arch/x86/kernel/cpu/perf_event_intel.c:901:9: sparse: constant 0x768005ffff is so big it is long
> >> arch/x86/kernel/cpu/perf_event_intel.c:902:9: sparse: constant 0x768005ffff is so big it is long
>
> vim +901 arch/x86/kernel/cpu/perf_event_intel.c
>
> 895 },
> 896 };
> 897
> 898 static struct extra_reg intel_slm_extra_regs[] __read_mostly =
> 899 {
> 900 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
> > 901 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0),
> > 902 INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1),
> 903 EVENT_EXTRA_END
> 904 };
> 905
Extend those constants to 64 bits.
Reported-by: fengguang.wu@intel.com
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130909112636.GQ31370@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-09 19:26:36 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
|
2015-06-25 02:23:35 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
|
2013-07-18 17:02:24 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SLM_DMND_READ SNB_DMND_DATA_RD
|
|
|
|
#define SLM_DMND_WRITE SNB_DMND_RFO
|
|
|
|
#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
|
|
|
|
#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
|
|
|
|
#define SLM_LLC_ACCESS SNB_RESP_ANY
|
|
|
|
#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 slm_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0,
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 slm_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0,
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2017-02-10 15:23:58 +08:00
|
|
|
EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
|
|
|
|
EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
|
|
|
|
/* UOPS_NOT_DELIVERED.ANY */
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
|
|
|
|
/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
|
|
|
|
EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
|
|
|
|
/* UOPS_RETIRED.ANY */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
|
|
|
|
/* UOPS_ISSUED.ANY */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
|
|
|
|
|
|
|
|
/*
 * NULL-terminated list collecting the six *_glm top-down event attributes
 * declared with EVENT_ATTR_STR() in this file: total slots and its scale,
 * fetch bubbles, recovery bubbles, slots issued and slots retired.
 */
static struct attribute *glm_events_attrs[] = {
|
|
|
|
EVENT_PTR(td_total_slots_glm),
|
|
|
|
EVENT_PTR(td_total_slots_scale_glm),
|
|
|
|
EVENT_PTR(td_fetch_bubbles_glm),
|
|
|
|
EVENT_PTR(td_recovery_bubbles_glm),
|
|
|
|
EVENT_PTR(td_slots_issued_glm),
|
|
|
|
EVENT_PTR(td_slots_retired_glm),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-04-15 15:42:47 +08:00
|
|
|
/*
 * Extra-register table for the *_glm tables in this file.  Each entry pairs
 * an event encoding (0x01b7 / 0x02b7) with the MSR it needs
 * (MSR_OFFCORE_RSP_0 / MSR_OFFCORE_RSP_1) and an RSP_0 / RSP_1 index.
 * NOTE(review): the third argument is presumably the mask of MSR bits that
 * may be set -- confirm against the INTEL_UEVENT_EXTRA_REG() definition.
 * The two masks differ only in bit 38 (set for RSP_0, clear for RSP_1).
 */
static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1),
|
|
|
|
/* table terminator */
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
#define GLM_DEMAND_DATA_RD BIT_ULL(0)
|
|
|
|
#define GLM_DEMAND_RFO BIT_ULL(1)
|
|
|
|
#define GLM_ANY_RESPONSE BIT_ULL(16)
|
|
|
|
#define GLM_SNP_NONE_OR_MISS BIT_ULL(33)
|
|
|
|
#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD
|
|
|
|
#define GLM_DEMAND_WRITE GLM_DEMAND_RFO
|
|
|
|
#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
#define GLM_LLC_ACCESS GLM_ANY_RESPONSE
|
|
|
|
#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM)
|
|
|
|
#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 glm_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(L1D)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(L1I)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(DTLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(ITLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(BPU)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 glm_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_PREFETCH|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2017-07-12 21:44:23 +08:00
|
|
|
static __initconst const u64 glp_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(L1D)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(L1I)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(DTLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(ITLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(BPU)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 glp_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
#define TNT_LOCAL_DRAM BIT_ULL(26)
|
|
|
|
#define TNT_DEMAND_READ GLM_DEMAND_DATA_RD
|
|
|
|
#define TNT_DEMAND_WRITE GLM_DEMAND_RFO
|
|
|
|
#define TNT_LLC_ACCESS GLM_ANY_RESPONSE
|
|
|
|
#define TNT_SNP_ANY (SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \
|
|
|
|
SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM)
|
|
|
|
#define TNT_LLC_MISS (TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 tnt_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = TNT_DEMAND_READ|
|
|
|
|
TNT_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = TNT_DEMAND_READ|
|
|
|
|
TNT_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = TNT_DEMAND_WRITE|
|
|
|
|
TNT_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = TNT_DEMAND_WRITE|
|
|
|
|
TNT_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2020-12-09 04:05:52 +08:00
|
|
|
EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0");
|
|
|
|
EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0");
|
|
|
|
EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6");
|
|
|
|
EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0");
|
|
|
|
|
|
|
|
/*
 * NULL-terminated list collecting the four *_tnt top-down event attributes
 * declared with EVENT_ATTR_STR() in this file: front-end bound, retiring,
 * bad speculation and back-end bound.
 */
static struct attribute *tnt_events_attrs[] = {
|
|
|
|
EVENT_PTR(td_fe_bound_tnt),
|
|
|
|
EVENT_PTR(td_retiring_tnt),
|
|
|
|
EVENT_PTR(td_bad_spec_tnt),
|
|
|
|
EVENT_PTR(td_be_bound_tnt),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
/*
 * Extra-register table for the *_tnt tables in this file.  Each entry pairs
 * an event encoding (0x01b7 / 0x02b7) with the MSR it needs
 * (MSR_OFFCORE_RSP_0 / MSR_OFFCORE_RSP_1) and an RSP_0 / RSP_1 index.
 * NOTE(review): the third argument is presumably the mask of MSR bits that
 * may be set -- confirm against the INTEL_UEVENT_EXTRA_REG() definition.
 * The two masks differ only in bit 63 (set for RSP_0, clear for RSP_1).
 */
static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
2020-05-01 20:54:42 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1),
|
2019-04-11 02:57:09 +08:00
|
|
|
/* table terminator */
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
|
|
|
|
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
|
|
|
|
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
|
|
|
|
#define KNL_MCDRAM_FAR BIT_ULL(22)
|
|
|
|
#define KNL_DDR_LOCAL BIT_ULL(23)
|
|
|
|
#define KNL_DDR_FAR BIT_ULL(24)
|
|
|
|
#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
|
|
|
|
KNL_DDR_LOCAL | KNL_DDR_FAR)
|
|
|
|
#define KNL_L2_READ SLM_DMND_READ
|
|
|
|
#define KNL_L2_WRITE SLM_DMND_WRITE
|
|
|
|
#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
|
|
|
|
#define KNL_L2_ACCESS SLM_LLC_ACCESS
|
|
|
|
#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
|
|
|
|
KNL_DRAM_ANY | SNB_SNP_ANY | \
|
|
|
|
SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 knl_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = 0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitly in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
/*
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
In this case, the number of events is larger than the number of counters, so
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even when
GLOBAL_CTRL is cleared, it is still possible for the kernel to get a PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
* Used from PMIs where the LBRs are already disabled.
|
|
|
|
*
|
|
|
|
* This function could be called consecutively. It is required to remain in
|
|
|
|
* disabled state if called consecutively.
|
|
|
|
*
|
|
|
|
* During consecutive calls, the same disable value will be written to related
|
2016-09-15 16:22:33 +08:00
|
|
|
* registers, so the PMU state remains unchanged.
|
|
|
|
*
|
|
|
|
* intel_bts events don't coexist with intel PMU's BTS events because of
|
|
|
|
* x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them
|
|
|
|
* disabled around intel PMU's event batching etc, only inside the PMI handler.
|
2020-01-22 02:13:38 +08:00
|
|
|
*
|
|
|
|
* Avoid PEBS_ENABLE MSR access in PMIs.
|
|
|
|
* The GLOBAL_CTRL has been disabled, so none of the counters count anymore.
|
|
|
|
* It doesn't matter if the PEBS is enabled or not.
|
|
|
|
* Usually, the PEBS status is not changed in PMIs, so it is unnecessary to
|
|
|
|
* access PEBS_ENABLE MSR in disable_all()/enable_all().
|
|
|
|
* However, there are some cases which may change PEBS status, e.g. PMI
|
|
|
|
* throttle. The PEBS_ENABLE should be updated where the status changes.
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitly in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
*/
|
|
|
|
static void __intel_pmu_disable_all(void)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_disable_bts();
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Full disable path: calls, in this order, __intel_pmu_disable_all(),
 * intel_pmu_pebs_disable_all() and intel_pmu_lbr_disable_all().
 */
static void intel_pmu_disable_all(void)
{
	__intel_pmu_disable_all();
	intel_pmu_pebs_disable_all();
	intel_pmu_lbr_disable_all();
}
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitly in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
/*
 * Re-enable LBR, then program MSR_CORE_PERF_GLOBAL_CTRL, then BTS.
 *
 * @added: not referenced by this function.
 * @pmi:   forwarded unchanged to intel_pmu_lbr_enable_all().
 *
 * The value written to the global control MSR is the PMU's intel_ctrl
 * with every bit of cpuc->intel_ctrl_guest_mask cleared.
 */
static void __intel_pmu_enable_all(int added, bool pmi)
{
	struct cpu_hw_events *hw_events = this_cpu_ptr(&cpu_hw_events);
	u64 ctrl_bits = hybrid(hw_events->pmu, intel_ctrl);
	struct perf_event *bts_event;

	intel_pmu_lbr_enable_all(pmi);
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
	       ctrl_bits & ~hw_events->intel_ctrl_guest_mask);

	/* Nothing more to do unless the BTS index is marked active. */
	if (!test_bit(INTEL_PMC_IDX_FIXED_BTS, hw_events->active_mask))
		return;

	/* An active BTS bit without an event is unexpected: warn once, bail. */
	bts_event = hw_events->events[INTEL_PMC_IDX_FIXED_BTS];
	if (WARN_ON_ONCE(!bts_event))
		return;

	intel_pmu_enable_bts(bts_event->hw.config);
}
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitly in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
static void intel_pmu_enable_all(int added)
|
|
|
|
{
|
2020-01-22 02:13:38 +08:00
|
|
|
intel_pmu_pebs_enable_all();
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
__intel_pmu_enable_all(added, false);
|
|
|
|
}
|
|
|
|
|
2010-03-26 21:08:44 +08:00
|
|
|
/*
|
|
|
|
* Workaround for:
|
|
|
|
* Intel Errata AAK100 (model 26)
|
|
|
|
* Intel Errata AAP53 (model 30)
|
2010-03-29 22:37:17 +08:00
|
|
|
* Intel Errata BD53 (model 44)
|
2010-03-26 21:08:44 +08:00
|
|
|
*
|
2010-08-06 13:39:08 +08:00
|
|
|
* The official story:
|
|
|
|
* These chips need to be 'reset' when adding counters by programming the
|
|
|
|
* magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
|
|
|
|
* in sequence on the same PMC or on different PMCs.
|
|
|
|
*
|
|
|
|
* In practice it appears some of these events do in fact count, and
|
2018-12-03 17:47:34 +08:00
|
|
|
* we need to program all 4 events.
|
2010-03-26 21:08:44 +08:00
|
|
|
*/
|
2010-08-06 13:39:08 +08:00
|
|
|
static void intel_pmu_nhm_workaround(void)
|
2010-03-26 21:08:44 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-08-06 13:39:08 +08:00
|
|
|
static const unsigned long nhm_magic[4] = {
|
|
|
|
0x4300B5,
|
|
|
|
0x4300D2,
|
|
|
|
0x4300B1,
|
|
|
|
0x4300B1
|
|
|
|
};
|
|
|
|
struct perf_event *event;
|
|
|
|
int i;
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/*
|
|
|
|
* The Errata requires below steps:
|
|
|
|
* 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 2) Configure 4 PERFEVTSELx with the magic events and clear
|
|
|
|
* the corresponding PMCx;
|
|
|
|
* 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 5) Clear 4 pairs of ERFEVTSELx and PMCx;
|
|
|
|
*/
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/*
|
|
|
|
* The real steps we choose are a little different from above.
|
|
|
|
* A) To reduce MSR operations, we don't run step 1) as they
|
|
|
|
* are already cleared before this function is called;
|
|
|
|
* B) Call x86_perf_event_update to save PMCx before configuring
|
|
|
|
* PERFEVTSELx with magic number;
|
|
|
|
* C) With step 5), we do clear only when the PERFEVTSELx is
|
|
|
|
* not used currently.
|
|
|
|
* D) Call x86_perf_event_set_period to restore PMCx;
|
|
|
|
*/
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/* We always operate 4 pairs of PERF Counters */
|
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
event = cpuc->events[i];
|
|
|
|
if (event)
|
|
|
|
x86_perf_event_update(event);
|
|
|
|
}
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
|
|
|
|
}
|
|
|
|
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
event = cpuc->events[i];
|
|
|
|
|
|
|
|
if (event) {
|
|
|
|
x86_perf_event_set_period(event);
|
2010-04-14 04:23:14 +08:00
|
|
|
__x86_pmu_enable_event(&event->hw,
|
2010-08-06 13:39:08 +08:00
|
|
|
ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
} else
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
|
2010-03-26 21:08:44 +08:00
|
|
|
}
|
2010-08-06 13:39:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Enable path that runs intel_pmu_nhm_workaround() first whenever
 * @added is non-zero, and then always calls intel_pmu_enable_all().
 */
static void intel_pmu_nhm_enable_all(int added)
{
	if (added != 0)
		intel_pmu_nhm_workaround();

	intel_pmu_enable_all(added);
}
|
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
/*
 * Set MSR_TSX_FORCE_ABORT to MSR_TFA_RTM_FORCE_ABORT (@on) or 0 (!@on).
 *
 * cpuc->tfa_shadow caches the last value written, so the MSR write is
 * skipped when the requested value is already in place.
 */
static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on)
{
	u64 wanted = 0;

	if (on)
		wanted = MSR_TFA_RTM_FORCE_ABORT;

	if (cpuc->tfa_shadow == wanted)
		return;

	cpuc->tfa_shadow = wanted;
	wrmsrl(MSR_TSX_FORCE_ABORT, wanted);
}
|
|
|
|
|
|
|
|
static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We're going to use PMC3, make sure TFA is set before we touch it.
|
|
|
|
*/
|
2019-03-14 16:47:18 +08:00
|
|
|
if (cntr == 3)
|
2019-03-06 05:23:18 +08:00
|
|
|
intel_set_tfa(cpuc, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_tfa_pmu_enable_all(int added)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we find PMC3 is no longer used when we enable the PMU, we can
|
|
|
|
* clear TFA.
|
|
|
|
*/
|
|
|
|
if (!test_bit(3, cpuc->active_mask))
|
|
|
|
intel_set_tfa(cpuc, false);
|
|
|
|
|
|
|
|
intel_pmu_enable_all(added);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static inline u64 intel_pmu_get_status(void)
|
|
|
|
{
|
|
|
|
u64 status;
|
|
|
|
|
|
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Write @ack to MSR_CORE_PERF_GLOBAL_OVF_CTRL. */
static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}
|
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
static inline bool event_is_checkpointed(struct perf_event *event)
|
|
|
|
{
|
|
|
|
return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_set_masks(struct perf_event *event, int idx)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
if (event->attr.exclude_host)
|
|
|
|
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
|
|
|
|
if (event->attr.exclude_guest)
|
|
|
|
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
|
|
|
|
if (event_is_checkpointed(event))
|
|
|
|
__set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_clear_masks(struct perf_event *event, int idx)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2020-06-13 16:09:47 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
__clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
|
|
|
|
__clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
|
|
|
|
__clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_disable_fixed(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2010-02-26 19:05:05 +08:00
|
|
|
u64 ctrl_val, mask;
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
int idx = hwc->idx;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
if (is_topdown_idx(idx)) {
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When there are other active TopDown events,
|
|
|
|
* don't disable the fixed counter 3.
|
|
|
|
*/
|
|
|
|
if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx))
|
|
|
|
return;
|
|
|
|
idx = INTEL_PMC_IDX_FIXED_SLOTS;
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
intel_clear_masks(event, idx);
|
|
|
|
|
|
|
|
mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4);
|
2010-02-26 19:05:05 +08:00
|
|
|
rdmsrl(hwc->config_base, ctrl_val);
|
|
|
|
ctrl_val &= ~mask;
|
2010-03-08 20:51:31 +08:00
|
|
|
wrmsrl(hwc->config_base, ctrl_val);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
static void intel_pmu_disable_event(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:32:08 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2020-06-13 16:09:47 +08:00
|
|
|
int idx = hwc->idx;
|
2010-03-03 03:32:08 +08:00
|
|
|
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
switch (idx) {
|
|
|
|
case 0 ... INTEL_PMC_IDX_FIXED - 1:
|
2020-06-13 16:09:47 +08:00
|
|
|
intel_clear_masks(event, idx);
|
|
|
|
x86_pmu_disable_event(event);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
|
2020-06-13 16:09:47 +08:00
|
|
|
intel_pmu_disable_fixed(event);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
case INTEL_PMC_IDX_FIXED_BTS:
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_disable_bts();
|
|
|
|
intel_pmu_drain_bts_buffer();
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
return;
|
|
|
|
case INTEL_PMC_IDX_FIXED_VLBR:
|
2020-06-13 16:09:50 +08:00
|
|
|
intel_clear_masks(event, idx);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
intel_clear_masks(event, idx);
|
|
|
|
pr_warn("Failed to disable the event with invalid index %d\n",
|
|
|
|
idx);
|
|
|
|
return;
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2019-05-04 23:15:56 +08:00
|
|
|
/*
|
|
|
|
* Needs to be called after x86_pmu_disable_event,
|
|
|
|
* so we don't trigger the event without PEBS bit set.
|
|
|
|
*/
|
|
|
|
if (unlikely(event->attr.precise_ip))
|
|
|
|
intel_pmu_pebs_disable(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
static void intel_pmu_del_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (needs_branch_stack(event))
|
|
|
|
intel_pmu_lbr_del(event);
|
|
|
|
if (event->attr.precise_ip)
|
|
|
|
intel_pmu_pebs_del(event);
|
|
|
|
}
|
|
|
|
|
2020-07-24 01:11:13 +08:00
|
|
|
static int icl_set_topdown_event_period(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
s64 left = local64_read(&hwc->period_left);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The values in PERF_METRICS MSR are derived from fixed counter 3.
|
|
|
|
* Software should start both registers, PERF_METRICS and fixed
|
|
|
|
* counter 3, from zero.
|
|
|
|
* Clear PERF_METRICS and Fixed counter 3 in initialization.
|
|
|
|
* After that, both MSRs will be cleared for each read.
|
|
|
|
* Don't need to clear them again.
|
|
|
|
*/
|
|
|
|
if (left == x86_pmu.max_period) {
|
|
|
|
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
|
|
|
|
wrmsrl(MSR_PERF_METRICS, 0);
|
2020-07-24 01:11:14 +08:00
|
|
|
hwc->saved_slots = 0;
|
|
|
|
hwc->saved_metric = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((hwc->saved_slots) && is_slots_event(event)) {
|
|
|
|
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots);
|
|
|
|
wrmsrl(MSR_PERF_METRICS, hwc->saved_metric);
|
2020-07-24 01:11:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
perf_event_update_userpage(event);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
|
|
|
|
{
|
|
|
|
u32 val;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The metric is reported as an 8bit integer fraction
|
|
|
|
* summing up to 0xff.
|
|
|
|
* slots-in-metric = (Metric / 0xff) * slots
|
|
|
|
*/
|
|
|
|
val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff;
|
|
|
|
return mul_u64_u32_div(slots, val, 0xff);
|
|
|
|
}
|
|
|
|
|
2020-07-24 01:11:14 +08:00
|
|
|
static u64 icl_get_topdown_value(struct perf_event *event,
|
2020-07-24 01:11:13 +08:00
|
|
|
u64 slots, u64 metrics)
|
|
|
|
{
|
|
|
|
int idx = event->hw.idx;
|
|
|
|
u64 delta;
|
|
|
|
|
|
|
|
if (is_metric_idx(idx))
|
|
|
|
delta = icl_get_metrics_event_value(metrics, slots, idx);
|
|
|
|
else
|
|
|
|
delta = slots;
|
|
|
|
|
2020-07-24 01:11:14 +08:00
|
|
|
return delta;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __icl_update_topdown_event(struct perf_event *event,
|
|
|
|
u64 slots, u64 metrics,
|
|
|
|
u64 last_slots, u64 last_metrics)
|
|
|
|
{
|
|
|
|
u64 delta, last = 0;
|
|
|
|
|
|
|
|
delta = icl_get_topdown_value(event, slots, metrics);
|
|
|
|
if (last_slots)
|
|
|
|
last = icl_get_topdown_value(event, last_slots, last_metrics);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The 8bit integer fraction of a metric may not be accurate,
|
|
|
|
* especially when the change is very small.
|
|
|
|
* For example, if only a few bad_spec happens, the fraction
|
|
|
|
* may be reduced from 1 to 0. If so, the bad_spec event value
|
|
|
|
* will be 0 which is definitely less than the last value.
|
|
|
|
* Avoid updating event->count in this case.
|
|
|
|
*/
|
|
|
|
if (delta > last) {
|
|
|
|
delta -= last;
|
|
|
|
local64_add(delta, &event->count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-29 06:40:08 +08:00
|
|
|
static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
|
|
|
|
u64 metrics, int metric_end)
|
2020-07-24 01:11:14 +08:00
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
struct perf_event *other;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
event->hw.saved_slots = slots;
|
|
|
|
event->hw.saved_metric = metrics;
|
|
|
|
|
2021-01-29 06:40:08 +08:00
|
|
|
for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
|
2020-07-24 01:11:14 +08:00
|
|
|
if (!is_topdown_idx(idx))
|
|
|
|
continue;
|
|
|
|
other = cpuc->events[idx];
|
|
|
|
other->hw.saved_slots = slots;
|
|
|
|
other->hw.saved_metric = metrics;
|
|
|
|
}
|
2020-07-24 01:11:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update all active Topdown events.
|
|
|
|
*
|
|
|
|
* The PERF_METRICS and Fixed counter 3 are read separately. The values may be
|
|
|
|
* modified by an NMI. PMU has to be disabled before calling this function.
|
|
|
|
*/
|
2021-01-29 06:40:08 +08:00
|
|
|
|
|
|
|
static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
|
2020-07-24 01:11:13 +08:00
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
struct perf_event *other;
|
|
|
|
u64 slots, metrics;
|
2020-07-24 01:11:14 +08:00
|
|
|
bool reset = true;
|
2020-07-24 01:11:13 +08:00
|
|
|
int idx;
|
|
|
|
|
|
|
|
/* read Fixed counter 3 */
|
|
|
|
rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots);
|
|
|
|
if (!slots)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* read PERF_METRICS */
|
|
|
|
rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
|
|
|
|
|
2021-01-29 06:40:08 +08:00
|
|
|
for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
|
2020-07-24 01:11:13 +08:00
|
|
|
if (!is_topdown_idx(idx))
|
|
|
|
continue;
|
|
|
|
other = cpuc->events[idx];
|
2020-07-24 01:11:14 +08:00
|
|
|
__icl_update_topdown_event(other, slots, metrics,
|
|
|
|
event ? event->hw.saved_slots : 0,
|
|
|
|
event ? event->hw.saved_metric : 0);
|
2020-07-24 01:11:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check and update this event, which may have been cleared
|
|
|
|
* in active_mask e.g. x86_pmu_stop()
|
|
|
|
*/
|
2020-07-24 01:11:14 +08:00
|
|
|
if (event && !test_bit(event->hw.idx, cpuc->active_mask)) {
|
|
|
|
__icl_update_topdown_event(event, slots, metrics,
|
|
|
|
event->hw.saved_slots,
|
|
|
|
event->hw.saved_metric);
|
2020-07-24 01:11:13 +08:00
|
|
|
|
2020-07-24 01:11:14 +08:00
|
|
|
/*
|
|
|
|
* In x86_pmu_stop(), the event is cleared in active_mask first,
|
|
|
|
* then drain the delta, which indicates context switch for
|
|
|
|
* counting.
|
|
|
|
* Save metric and slots for context switch.
|
|
|
|
* Don't need to reset the PERF_METRICS and Fixed counter 3.
|
|
|
|
* Because the values will be restored in next schedule in.
|
|
|
|
*/
|
2021-01-29 06:40:08 +08:00
|
|
|
update_saved_topdown_regs(event, slots, metrics, metric_end);
|
2020-07-24 01:11:14 +08:00
|
|
|
reset = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (reset) {
|
|
|
|
/* The fixed counter 3 has to be written before the PERF_METRICS. */
|
|
|
|
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
|
|
|
|
wrmsrl(MSR_PERF_METRICS, 0);
|
|
|
|
if (event)
|
2021-01-29 06:40:08 +08:00
|
|
|
update_saved_topdown_regs(event, 0, 0, metric_end);
|
2020-07-24 01:11:14 +08:00
|
|
|
}
|
2020-07-24 01:11:13 +08:00
|
|
|
|
|
|
|
return slots;
|
|
|
|
}
|
|
|
|
|
2021-01-29 06:40:08 +08:00
|
|
|
static u64 icl_update_topdown_event(struct perf_event *event)
|
|
|
|
{
|
2021-01-29 06:40:09 +08:00
|
|
|
return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
|
|
|
|
x86_pmu.num_topdown_events - 1);
|
2021-01-29 06:40:08 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
static void intel_pmu_read_topdown_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/* Only need to call update_topdown_event() once for group read. */
|
|
|
|
if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
|
|
|
|
!is_slots_event(event))
|
|
|
|
return;
|
|
|
|
|
|
|
|
perf_pmu_disable(event->pmu);
|
|
|
|
x86_pmu.update_topdown_event(event);
|
|
|
|
perf_pmu_enable(event->pmu);
|
|
|
|
}
|
|
|
|
|
2018-02-13 06:20:34 +08:00
|
|
|
static void intel_pmu_read_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
|
|
|
|
intel_pmu_auto_reload_read(event);
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
else if (is_topdown_count(event) && x86_pmu.update_topdown_event)
|
|
|
|
intel_pmu_read_topdown_event(event);
|
2018-02-13 06:20:34 +08:00
|
|
|
else
|
|
|
|
x86_perf_event_update(event);
|
|
|
|
}
|
|
|
|
|
2018-03-09 10:15:40 +08:00
|
|
|
static void intel_pmu_enable_fixed(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2018-03-09 10:15:40 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
u64 ctrl_val, mask, bits = 0;
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
int idx = hwc->idx;
|
|
|
|
|
|
|
|
if (is_topdown_idx(idx)) {
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
/*
|
|
|
|
* When there are other active TopDown events,
|
|
|
|
* don't enable the fixed counter 3 again.
|
|
|
|
*/
|
|
|
|
if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx))
|
|
|
|
return;
|
|
|
|
|
|
|
|
idx = INTEL_PMC_IDX_FIXED_SLOTS;
|
|
|
|
}
|
|
|
|
|
|
|
|
intel_set_masks(event, idx);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
/*
|
2018-03-09 10:15:40 +08:00
|
|
|
* Enable IRQ generation (0x8), if not PEBS,
|
2010-02-26 19:05:05 +08:00
|
|
|
* and enable ring-3 counting (0x2) and ring-0 counting (0x1)
|
|
|
|
* if requested:
|
|
|
|
*/
|
2018-03-09 10:15:40 +08:00
|
|
|
if (!event->attr.precise_ip)
|
|
|
|
bits |= 0x8;
|
2010-02-26 19:05:05 +08:00
|
|
|
if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
|
|
|
|
bits |= 0x2;
|
|
|
|
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
|
|
|
|
bits |= 0x1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ANY bit is supported in v3 and up
|
|
|
|
*/
|
|
|
|
if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
|
|
|
|
bits |= 0x4;
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Perf already supports the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids unnecessary writes to the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS-related register may overflow. Bit 48 of the STATUS
register will then be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
idx -= INTEL_PMC_IDX_FIXED;
|
2010-02-26 19:05:05 +08:00
|
|
|
bits <<= (idx * 4);
|
|
|
|
mask = 0xfULL << (idx * 4);
|
|
|
|
|
perf/x86/intel: Support adaptive PEBS v4
Adaptive PEBS is a new way to report PEBS sampling information. Instead
of a fixed size record for all PEBS events it allows to configure the
PEBS record to only include the information needed. Events can then opt
in to use such an extended record, or stay with a basic record which
only contains the IP.
The major new feature is to support LBRs in PEBS record.
Besides normal LBR, this allows (much faster) large PEBS, while still
supporting callstacks through callstack LBR. So essentially a lot of
profiling can now be done without frequent interrupts, dropping the
overhead significantly.
The main requirement still is to use a period, and not use frequency
mode, because frequency mode requires reevaluating the frequency on each
overflow.
The floating point state (XMM) is also supported, which allows efficient
profiling of FP function arguments.
Introduce specific drain function to handle variable length records.
Use a new callback to parse the new record format, and also handle the
STATUS field now being at a different offset.
Add code to set up the configuration register. Since there is only a
single register, all events either get the full super set of all events,
or only the basic record.
Originally-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: jolsa@kernel.org
Link: https://lkml.kernel.org/r/20190402194509.2832-6-kan.liang@linux.intel.com
[ Renamed GPRS => GP. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-03 03:45:02 +08:00
|
|
|
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
|
|
|
|
bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
|
|
|
|
mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
rdmsrl(hwc->config_base, ctrl_val);
|
|
|
|
ctrl_val &= ~mask;
|
|
|
|
ctrl_val |= bits;
|
2010-03-08 20:51:31 +08:00
|
|
|
wrmsrl(hwc->config_base, ctrl_val);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
2010-03-03 03:32:08 +08:00
|
|
|
static void intel_pmu_enable_event(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:32:08 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2020-06-13 16:09:47 +08:00
|
|
|
int idx = hwc->idx;
|
2013-09-12 18:53:44 +08:00
|
|
|
|
2018-03-09 10:15:40 +08:00
|
|
|
if (unlikely(event->attr.precise_ip))
|
|
|
|
intel_pmu_pebs_enable(event);
|
|
|
|
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
switch (idx) {
|
|
|
|
case 0 ... INTEL_PMC_IDX_FIXED - 1:
|
2020-06-13 16:09:47 +08:00
|
|
|
intel_set_masks(event, idx);
|
|
|
|
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, the multiplexing probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may be overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also update all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
|
2018-03-09 10:15:40 +08:00
|
|
|
intel_pmu_enable_fixed(event);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
case INTEL_PMC_IDX_FIXED_BTS:
|
2020-06-13 16:09:47 +08:00
|
|
|
if (!__this_cpu_read(cpu_hw_events.enabled))
|
|
|
|
return;
|
|
|
|
intel_pmu_enable_bts(hwc->config);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
case INTEL_PMC_IDX_FIXED_VLBR:
|
2020-06-13 16:09:50 +08:00
|
|
|
intel_set_masks(event, idx);
|
perf/x86/intel: Use switch in intel_pmu_disable/enable_event
Currently, the if-else is used in the intel_pmu_disable/enable_event to
check the type of an event. It works well, but with more and more types
added later, e.g., perf metrics, compared to the switch statement, the
if-else may impair the readability of the code.
There is no harm to use the switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump-table which is more efficient than if-else for a large
number of cases. The performance gain may not be observed for now,
because the number of cases is only 5, but the benefits may be observed
with more and more types added in the future.
Use switch to replace the if-else in the intel_pmu_disable/enable_event.
If the idx is invalid, print a warning.
For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't
need to check the event->attr.precise_ip. Use return for the case.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.liang@linux.intel.com
2020-07-24 01:11:09 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
pr_warn("Failed to enable the event with invalid index %d\n",
|
|
|
|
idx);
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, which makes it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
static void intel_pmu_add_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip)
|
|
|
|
intel_pmu_pebs_add(event);
|
|
|
|
if (needs_branch_stack(event))
|
|
|
|
intel_pmu_lbr_add(event);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Save and restart an expired event. Called by NMI contexts,
|
|
|
|
* so it has to be careful about preempting normal event ops:
|
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
int intel_pmu_save_and_restart(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:18:39 +08:00
|
|
|
x86_perf_event_update(event);
|
2013-09-06 11:37:38 +08:00
|
|
|
/*
|
|
|
|
* For a checkpointed counter always reset back to 0. This
|
|
|
|
* avoids a situation where the counter overflows, aborts the
|
|
|
|
* transaction and is then set back to shortly before the
|
|
|
|
* overflow, and overflows and aborts again.
|
|
|
|
*/
|
|
|
|
if (unlikely(event_is_checkpointed(event))) {
|
|
|
|
/* No race with NMIs because the counter should not be armed */
|
|
|
|
wrmsrl(event->hw.event_base, 0);
|
|
|
|
local64_set(&event->hw.prev_count, 0);
|
|
|
|
}
|
2010-03-03 03:18:39 +08:00
|
|
|
return x86_perf_event_set_period(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_reset(void)
|
|
|
|
{
|
2010-12-18 23:28:55 +08:00
|
|
|
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
|
2021-04-12 22:30:45 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2021-04-12 22:30:46 +08:00
|
|
|
int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
|
|
|
|
int num_counters = hybrid(cpuc->pmu, num_counters);
|
2010-02-26 19:05:05 +08:00
|
|
|
unsigned long flags;
|
|
|
|
int idx;
|
|
|
|
|
2021-04-12 22:30:46 +08:00
|
|
|
if (!num_counters)
|
2010-02-26 19:05:05 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2021-04-12 22:30:46 +08:00
|
|
|
for (idx = 0; idx < num_counters; idx++) {
|
2012-06-08 04:32:04 +08:00
|
|
|
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
|
|
|
|
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
2021-04-12 22:30:46 +08:00
|
|
|
for (idx = 0; idx < num_counters_fixed; idx++) {
|
2021-04-12 22:30:45 +08:00
|
|
|
if (fixed_counter_disabled(idx, cpuc->pmu))
|
2021-01-29 06:40:11 +08:00
|
|
|
continue;
|
2012-06-08 04:32:04 +08:00
|
|
|
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
|
2021-01-29 06:40:11 +08:00
|
|
|
}
|
2010-03-30 00:36:50 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (ds)
|
|
|
|
ds->bts_index = ds->bts_buffer_base;
|
|
|
|
|
2015-02-28 01:48:30 +08:00
|
|
|
/* Ack all overflows and disable fixed counters */
|
|
|
|
if (x86_pmu.version >= 2) {
|
|
|
|
intel_pmu_ack_status(intel_pmu_get_status());
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Reset LBRs and LBR freezing */
|
|
|
|
if (x86_pmu.lbr_nr) {
|
|
|
|
update_debugctlmsr(get_debugctlmsr() &
|
|
|
|
~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:06 +08:00
|
|
|
static int handle_pmi_common(struct pt_regs *regs, u64 status)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
struct perf_sample_data data;
|
2018-08-08 15:12:06 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
int bit;
|
|
|
|
int handled = 0;
|
2021-04-12 22:30:45 +08:00
|
|
|
u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
inc_irq_stat(apic_perf_irqs);
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled due to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a 'sticky bit' to
indicate changes to the state of performance monitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the behaviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
/*
|
2015-05-11 03:22:45 +08:00
|
|
|
* Ignore a range of extra bits in status that do not indicate
|
|
|
|
* overflow by themselves.
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled due to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a 'sticky bit' to
indicate changes to the state of performance monitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the behaviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
*/
|
2015-05-11 03:22:45 +08:00
|
|
|
status &= ~(GLOBAL_STATUS_COND_CHG |
|
|
|
|
GLOBAL_STATUS_ASIF |
|
|
|
|
GLOBAL_STATUS_LBRS_FROZEN);
|
|
|
|
if (!status)
|
2018-08-08 15:12:06 +08:00
|
|
|
return 0;
|
2016-12-22 16:29:26 +08:00
|
|
|
/*
|
|
|
|
* In case multiple PEBS events are sampled at the same time,
|
|
|
|
* it is possible to have GLOBAL_STATUS bit 62 set indicating
|
|
|
|
* PEBS buffer overflow and also seeing at most 3 PEBS counters
|
|
|
|
* having their bits set in the status register. This is a sign
|
|
|
|
* that there was at least one PEBS record pending at the time
|
|
|
|
* of the PMU interrupt. PEBS counters must only be processed
|
|
|
|
* via the drain_pebs() calls and not via the regular sample
|
|
|
|
* processing loop later in this function, otherwise
|
|
|
|
* phony regular samples may be generated in the sampling buffer
|
|
|
|
* not marked with the EXACT tag. Another possibility is to have
|
|
|
|
* one PEBS event and at least one non-PEBS event which overflows
|
|
|
|
* while PEBS is armed. In this case, bit 62 of GLOBAL_STATUS will
|
|
|
|
* not be set, yet the overflow status bit for the PEBS counter will
|
|
|
|
* be on Skylake.
|
|
|
|
*
|
|
|
|
* To avoid this problem, we systematically ignore the PEBS-enabled
|
|
|
|
* counters from the GLOBAL_STATUS mask and we always process PEBS
|
|
|
|
* events via drain_pebs().
|
|
|
|
*/
|
2018-03-09 10:15:41 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
|
|
|
|
status &= ~cpuc->pebs_enabled;
|
|
|
|
else
|
|
|
|
status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled due to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a 'sticky bit' to
indicate changes to the state of performance monitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the behaviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
/*
|
|
|
|
* PEBS overflow sets bit 62 in the global status register
|
|
|
|
*/
|
2020-07-24 01:11:05 +08:00
|
|
|
if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) {
|
2020-01-22 02:13:38 +08:00
|
|
|
u64 pebs_enabled = cpuc->pebs_enabled;
|
|
|
|
|
2010-09-03 03:07:49 +08:00
|
|
|
handled++;
|
2020-10-30 21:58:48 +08:00
|
|
|
x86_pmu.drain_pebs(regs, &data);
|
2021-04-12 22:30:45 +08:00
|
|
|
status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
|
2020-01-22 02:13:38 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* PMI throttle may be triggered, which stops the PEBS event.
|
|
|
|
* Although cpuc->pebs_enabled is updated accordingly, the
|
|
|
|
* MSR_IA32_PEBS_ENABLE is not updated. Because the
|
|
|
|
* cpuc->enabled has been forced to 0 in PMI.
|
|
|
|
* Update the MSR if pebs_enabled is changed.
|
|
|
|
*/
|
|
|
|
if (pebs_enabled != cpuc->pebs_enabled)
|
|
|
|
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
|
2010-09-03 03:07:49 +08:00
|
|
|
}
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2015-01-30 18:39:52 +08:00
|
|
|
/*
|
|
|
|
* Intel PT
|
|
|
|
*/
|
2020-07-24 01:11:05 +08:00
|
|
|
if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
|
2015-01-30 18:39:52 +08:00
|
|
|
handled++;
|
2019-02-19 08:26:07 +08:00
|
|
|
if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
|
|
|
|
perf_guest_cbs->handle_intel_pt_intr))
|
|
|
|
perf_guest_cbs->handle_intel_pt_intr();
|
|
|
|
else
|
|
|
|
intel_pt_interrupt();
|
2015-01-30 18:39:52 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may overflow. Bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
/*
|
|
|
|
* Intel Perf metrics
|
|
|
|
*/
|
|
|
|
if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
|
|
|
|
handled++;
|
|
|
|
if (x86_pmu.update_topdown_event)
|
|
|
|
x86_pmu.update_topdown_event(NULL);
|
|
|
|
}
|
|
|
|
|
2013-09-06 11:37:38 +08:00
|
|
|
/*
|
2013-09-12 18:53:44 +08:00
|
|
|
* Checkpointed counters can lead to 'spurious' PMIs because the
|
|
|
|
* rollback caused by the PMI will have cleared the overflow status
|
|
|
|
* bit. Therefore always force probe these counters.
|
2013-09-06 11:37:38 +08:00
|
|
|
*/
|
2013-09-12 18:53:44 +08:00
|
|
|
status |= cpuc->intel_cp_status;
|
2013-09-06 11:37:38 +08:00
|
|
|
|
2010-03-06 05:41:37 +08:00
|
|
|
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
|
2010-02-26 19:05:05 +08:00
|
|
|
struct perf_event *event = cpuc->events[bit];
|
|
|
|
|
2010-09-03 03:07:49 +08:00
|
|
|
handled++;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (!test_bit(bit, cpuc->active_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!intel_pmu_save_and_restart(event))
|
|
|
|
continue;
|
|
|
|
|
2012-04-03 02:19:08 +08:00
|
|
|
perf_sample_data_init(&data, 0, event->hw.last_period);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2012-02-10 06:20:57 +08:00
|
|
|
if (has_branch_stack(event))
|
|
|
|
data.br_stack = &cpuc->lbr_stack;
|
|
|
|
|
2011-06-27 20:41:57 +08:00
|
|
|
if (perf_event_overflow(event, &data, regs))
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
x86_pmu_stop(event, 0);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:06 +08:00
|
|
|
return handled;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This handler is triggered by the local APIC, so the APIC IRQ handling
|
|
|
|
* rules apply:
|
|
|
|
*/
|
|
|
|
static int intel_pmu_handle_irq(struct pt_regs *regs)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc;
|
|
|
|
int loops;
|
|
|
|
u64 status;
|
|
|
|
int handled;
|
|
|
|
int pmu_enabled;
|
|
|
|
|
|
|
|
cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save the PMU state.
|
|
|
|
* It needs to be restored when leaving the handler.
|
|
|
|
*/
|
|
|
|
pmu_enabled = cpuc->enabled;
|
|
|
|
/*
|
|
|
|
* No known reason to not always do late ACK,
|
|
|
|
* but just in case do it opt-in.
|
|
|
|
*/
|
|
|
|
if (!x86_pmu.late_ack)
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
|
|
|
intel_bts_disable_local();
|
|
|
|
cpuc->enabled = 0;
|
|
|
|
__intel_pmu_disable_all();
|
|
|
|
handled = intel_pmu_drain_bts_buffer();
|
|
|
|
handled += intel_bts_interrupt();
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (!status)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
loops = 0;
|
|
|
|
again:
|
|
|
|
intel_pmu_lbr_read();
|
|
|
|
intel_pmu_ack_status(status);
|
|
|
|
if (++loops > 100) {
|
|
|
|
static bool warned;
|
|
|
|
|
|
|
|
if (!warned) {
|
|
|
|
WARN(1, "perfevents: irq loop stuck!\n");
|
|
|
|
perf_event_print_debug();
|
|
|
|
warned = true;
|
|
|
|
}
|
|
|
|
intel_pmu_reset();
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
handled += handle_pmi_common(regs, status);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Repeat if there is more work to be done:
|
|
|
|
*/
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (status)
|
|
|
|
goto again;
|
|
|
|
|
2010-03-08 20:51:01 +08:00
|
|
|
done:
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
/* Only restore PMU state when it's active. See x86_pmu_disable(). */
|
2018-02-20 18:11:50 +08:00
|
|
|
cpuc->enabled = pmu_enabled;
|
|
|
|
if (pmu_enabled)
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
__intel_pmu_enable_all(0, true);
|
2016-09-15 16:22:33 +08:00
|
|
|
intel_bts_enable_local();
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
if GLOBAL_CTRL is cleared, the kernel can still receive a PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
|
2013-06-18 08:36:50 +08:00
|
|
|
/*
|
|
|
|
* Only unmask the NMI after the overflow counters
|
|
|
|
* have been reset. This avoids spurious NMIs on
|
|
|
|
* Haswell CPUs.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.late_ack)
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
2010-09-03 03:07:49 +08:00
|
|
|
return handled;
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
intel_bts_constraints(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2018-11-21 18:16:11 +08:00
|
|
|
if (unlikely(intel_pmu_has_bts(event)))
|
2010-02-26 19:05:05 +08:00
|
|
|
return &bts_constraint;
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-06-13 16:09:49 +08:00
|
|
|
/*
|
|
|
|
* Note: matches a fake event, like Fixed2.
|
|
|
|
*/
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_vlbr_constraints(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c = &vlbr_constraint;
|
|
|
|
|
|
|
|
if (unlikely(constraint_match(c, event->hw.config)))
|
|
|
|
return c;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-04-12 22:30:50 +08:00
|
|
|
/*
 * Pick the alternate OFFCORE_RSP extra register for @idx.
 *
 * Returns @idx unchanged when the PMU lacks PMU_FL_HAS_RSP_1, when @idx
 * is neither EXTRA_REG_RSP_0 nor EXTRA_REG_RSP_1, or when @config has
 * bits set outside the alternate register's valid_mask. Otherwise
 * returns the other RSP index.
 */
static int intel_alt_er(struct cpu_hw_events *cpuc,
			int idx, u64 config)
{
	struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
	int other;

	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
		return idx;

	if (idx == EXTRA_REG_RSP_0)
		other = EXTRA_REG_RSP_1;
	else if (idx == EXTRA_REG_RSP_1)
		other = EXTRA_REG_RSP_0;
	else
		other = idx;

	/* The alternate register must accept every bit of @config. */
	return (config & ~extra_regs[other].valid_mask) ? idx : other;
}
|
|
|
|
|
|
|
|
static void intel_fixup_er(struct perf_event *event, int idx)
|
|
|
|
{
|
2021-04-12 22:30:50 +08:00
|
|
|
struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
|
2012-06-05 21:30:31 +08:00
|
|
|
event->hw.extra_reg.idx = idx;
|
|
|
|
|
|
|
|
if (idx == EXTRA_REG_RSP_0) {
|
2011-05-23 17:08:15 +08:00
|
|
|
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
|
2021-04-12 22:30:50 +08:00
|
|
|
event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
|
2011-05-23 17:08:15 +08:00
|
|
|
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
|
2012-06-05 21:30:31 +08:00
|
|
|
} else if (idx == EXTRA_REG_RSP_1) {
|
|
|
|
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
|
2021-04-12 22:30:50 +08:00
|
|
|
event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
|
2012-06-05 21:30:31 +08:00
|
|
|
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
|
2011-05-23 17:08:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
/*
|
|
|
|
* manage allocation of shared extra msr for certain events
|
|
|
|
*
|
|
|
|
* sharing can be:
|
|
|
|
* per-cpu: to be shared between the various events on a single PMU
|
|
|
|
* per-core: per-cpu + shared by HT threads
|
|
|
|
*/
|
2011-03-03 10:34:47 +08:00
|
|
|
static struct event_constraint *
|
2011-06-06 22:57:03 +08:00
|
|
|
__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
|
2012-02-10 06:20:53 +08:00
|
|
|
struct perf_event *event,
|
|
|
|
struct hw_perf_event_extra *reg)
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2011-06-06 22:57:03 +08:00
|
|
|
struct event_constraint *c = &emptyconstraint;
|
2011-03-03 10:34:47 +08:00
|
|
|
struct er_account *era;
|
2011-06-06 22:57:08 +08:00
|
|
|
unsigned long flags;
|
2012-06-05 21:30:31 +08:00
|
|
|
int idx = reg->idx;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
/*
|
|
|
|
* reg->alloc can be set due to existing state, so for fake cpuc we
|
|
|
|
* need to ignore this, otherwise we might fail to allocate proper fake
|
|
|
|
* state for this extra reg constraint. Also see the comment below.
|
|
|
|
*/
|
|
|
|
if (reg->alloc && !cpuc->is_fake)
|
2012-02-10 06:20:53 +08:00
|
|
|
return NULL; /* call x86_get_event_constraint() */
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-05-23 17:08:15 +08:00
|
|
|
again:
|
2012-06-05 21:30:31 +08:00
|
|
|
era = &cpuc->shared_regs->regs[idx];
|
2011-06-06 22:57:08 +08:00
|
|
|
/*
|
|
|
|
* we use spin_lock_irqsave() to avoid lockdep issues when
|
|
|
|
* passing a fake cpuc
|
|
|
|
*/
|
|
|
|
raw_spin_lock_irqsave(&era->lock, flags);
|
2011-06-06 22:57:03 +08:00
|
|
|
|
|
|
|
if (!atomic_read(&era->ref) || era->config == reg->config) {
|
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
/*
|
|
|
|
* If its a fake cpuc -- as per validate_{group,event}() we
|
|
|
|
* shouldn't touch event state and we can avoid doing so
|
|
|
|
* since both will only call get_event_constraints() once
|
|
|
|
* on each event, this avoids the need for reg->alloc.
|
|
|
|
*
|
|
|
|
* Not doing the ER fixup will only result in era->reg being
|
|
|
|
* wrong, but since we won't actually try and program hardware
|
|
|
|
* this isn't a problem either.
|
|
|
|
*/
|
|
|
|
if (!cpuc->is_fake) {
|
|
|
|
if (idx != reg->idx)
|
|
|
|
intel_fixup_er(event, idx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* x86_schedule_events() can call get_event_constraints()
|
|
|
|
* multiple times on events in the case of incremental
|
|
|
|
* scheduling(). reg->alloc ensures we only do the ER
|
|
|
|
* allocation once.
|
|
|
|
*/
|
|
|
|
reg->alloc = 1;
|
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
/* lock in msr value */
|
|
|
|
era->config = reg->config;
|
|
|
|
era->reg = reg->reg;
|
|
|
|
|
|
|
|
/* one more user */
|
|
|
|
atomic_inc(&era->ref);
|
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
/*
|
2012-02-10 06:20:53 +08:00
|
|
|
* need to call x86_get_event_constraint()
|
|
|
|
* to check if associated event has constraints
|
2011-03-03 10:34:47 +08:00
|
|
|
*/
|
2012-02-10 06:20:53 +08:00
|
|
|
c = NULL;
|
2012-06-05 21:30:31 +08:00
|
|
|
} else {
|
2021-04-12 22:30:50 +08:00
|
|
|
idx = intel_alt_er(cpuc, idx, reg->config);
|
2012-06-05 21:30:31 +08:00
|
|
|
if (idx != reg->idx) {
|
|
|
|
raw_spin_unlock_irqrestore(&era->lock, flags);
|
|
|
|
goto again;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
2011-06-06 22:57:08 +08:00
|
|
|
raw_spin_unlock_irqrestore(&era->lock, flags);
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct hw_perf_event_extra *reg)
|
|
|
|
{
|
|
|
|
struct er_account *era;
|
|
|
|
|
|
|
|
/*
|
2012-06-05 21:30:31 +08:00
|
|
|
* Only put constraint if extra reg was actually allocated. Also takes
|
|
|
|
* care of event which do not use an extra shared reg.
|
|
|
|
*
|
|
|
|
* Also, if this is a fake cpuc we shouldn't touch any event state
|
|
|
|
* (reg->alloc) and we don't care about leaving inconsistent cpuc state
|
|
|
|
* either since it'll be thrown out.
|
2011-06-06 22:57:03 +08:00
|
|
|
*/
|
2012-06-05 21:30:31 +08:00
|
|
|
if (!reg->alloc || cpuc->is_fake)
|
2011-06-06 22:57:03 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
era = &cpuc->shared_regs->regs[reg->idx];
|
|
|
|
|
|
|
|
/* one fewer user */
|
|
|
|
atomic_dec(&era->ref);
|
|
|
|
|
|
|
|
/* allocate again next time */
|
|
|
|
reg->alloc = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2012-02-10 06:20:53 +08:00
|
|
|
struct event_constraint *c = NULL, *d;
|
|
|
|
struct hw_perf_event_extra *xreg, *breg;
|
|
|
|
|
|
|
|
xreg = &event->hw.extra_reg;
|
|
|
|
if (xreg->idx != EXTRA_REG_NONE) {
|
|
|
|
c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
|
|
|
|
if (c == &emptyconstraint)
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
breg = &event->hw.branch_reg;
|
|
|
|
if (breg->idx != EXTRA_REG_NONE) {
|
|
|
|
d = __intel_shared_reg_get_constraints(cpuc, event, breg);
|
|
|
|
if (d == &emptyconstraint) {
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, xreg);
|
|
|
|
c = d;
|
|
|
|
}
|
|
|
|
}
|
2011-06-06 22:57:03 +08:00
|
|
|
return c;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
struct event_constraint *
|
2014-11-18 03:06:56 +08:00
|
|
|
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
2011-08-31 07:41:05 +08:00
|
|
|
{
|
2021-04-12 22:30:49 +08:00
|
|
|
struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints);
|
2011-08-31 07:41:05 +08:00
|
|
|
struct event_constraint *c;
|
|
|
|
|
2021-04-12 22:30:49 +08:00
|
|
|
if (event_constraints) {
|
|
|
|
for_each_event_constraint(c, event_constraints) {
|
2019-04-03 03:45:04 +08:00
|
|
|
if (constraint_match(c, event->hw.config)) {
|
2013-01-24 23:10:27 +08:00
|
|
|
event->hw.flags |= c->flags;
|
2011-08-31 07:41:05 +08:00
|
|
|
return c;
|
2013-01-24 23:10:27 +08:00
|
|
|
}
|
2011-08-31 07:41:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-12 22:30:47 +08:00
|
|
|
return &hybrid_var(cpuc->pmu, unconstrained);
|
2011-08-31 07:41:05 +08:00
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
 * Resolve the constraint for @event by trying each source in turn:
 * vLBR, BTS, shared registers, PEBS, and finally the generic x86 table.
 * The first lookup that yields a non-NULL constraint wins; later ones
 * are not called.
 */
static struct event_constraint *
__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
			    struct perf_event *event)
{
	struct event_constraint *c;

	c = intel_vlbr_constraints(event);
	if (!c)
		c = intel_bts_constraints(event);
	if (!c)
		c = intel_shared_regs_constraints(cpuc, event);
	if (!c)
		c = intel_pebs_constraints(event);
	if (!c)
		c = x86_get_event_constraints(cpuc, idx, event);

	return c;
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned
out to be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static void
|
|
|
|
intel_start_scheduling(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
2014-11-18 03:07:04 +08:00
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
xl->sched_started = true;
|
|
|
|
/*
|
|
|
|
* lock shared state until we are done scheduling
|
|
|
|
* in stop_event_scheduling()
|
|
|
|
* makes scheduling appear as a transaction
|
|
|
|
*/
|
|
|
|
raw_spin_lock(&excl_cntrs->lock);
|
|
|
|
}
|
|
|
|
|
2015-05-21 16:57:32 +08:00
|
|
|
static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
|
|
|
struct event_constraint *c = cpuc->event_constraint[idx];
|
|
|
|
struct intel_excl_states *xl;
|
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
|
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
lockdep_assert_held(&excl_cntrs->lock);
|
|
|
|
|
2015-05-21 16:57:36 +08:00
|
|
|
if (c->flags & PERF_X86_EVENT_EXCL)
|
2015-05-21 16:57:39 +08:00
|
|
|
xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
|
2015-05-21 16:57:36 +08:00
|
|
|
else
|
2015-05-21 16:57:39 +08:00
|
|
|
xl->state[cntr] = INTEL_EXCL_SHARED;
|
2015-05-21 16:57:32 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static void
|
|
|
|
intel_stop_scheduling(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
xl->sched_started = false;
|
|
|
|
/*
|
|
|
|
* release shared state lock (acquired in intel_start_scheduling())
|
|
|
|
*/
|
|
|
|
raw_spin_unlock(&excl_cntrs->lock);
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:16 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx)
|
|
|
|
{
|
|
|
|
WARN_ON_ONCE(!cpuc->constraint_list);
|
|
|
|
|
|
|
|
if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
|
|
|
|
struct event_constraint *cx;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* grab pre-allocated constraint entry
|
|
|
|
*/
|
|
|
|
cx = &cpuc->constraint_list[idx];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialize dynamic constraint
|
|
|
|
* with static constraint
|
|
|
|
*/
|
|
|
|
*cx = *c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mark constraint as dynamic
|
|
|
|
*/
|
|
|
|
cx->flags |= PERF_X86_EVENT_DYNAMIC;
|
|
|
|
c = cx;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
|
|
|
|
int idx, struct event_constraint *c)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xlo;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
2019-03-14 20:01:14 +08:00
|
|
|
int is_excl, i, w;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* validating a group does not require
|
|
|
|
* enforcing cross-thread exclusion
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
|
|
|
return c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* because we modify the constraint, we need
|
|
|
|
* to make a copy. Static constraints come
|
|
|
|
* from static const tables.
|
|
|
|
*
|
|
|
|
* only needed when constraint has not yet
|
|
|
|
* been cloned (marked dynamic)
|
|
|
|
*/
|
2019-03-06 05:23:16 +08:00
|
|
|
c = dyn_constraint(cpuc, c, idx);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* From here on, the constraint is dynamic.
|
|
|
|
* Either it was just allocated above, or it
|
|
|
|
* was allocated during a earlier invocation
|
|
|
|
* of this function
|
|
|
|
*/
|
|
|
|
|
2015-05-21 16:57:21 +08:00
|
|
|
/*
|
|
|
|
* state of sibling HT
|
|
|
|
*/
|
|
|
|
xlo = &excl_cntrs->states[tid ^ 1];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* event requires exclusive counter access
|
|
|
|
* across HT threads
|
|
|
|
*/
|
|
|
|
is_excl = c->flags & PERF_X86_EVENT_EXCL;
|
|
|
|
if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
|
|
|
|
if (!cpuc->n_excl++)
|
|
|
|
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* Modify static constraint with current dynamic
|
|
|
|
* state of thread
|
|
|
|
*
|
|
|
|
* EXCLUSIVE: sibling counter measuring exclusive event
|
|
|
|
* SHARED : sibling counter measuring non-exclusive event
|
|
|
|
* UNUSED : sibling counter unused
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
w = c->weight;
|
2015-05-21 16:57:24 +08:00
|
|
|
for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* exclusive event in sibling counter
|
|
|
|
* our corresponding counter cannot be used
|
|
|
|
* regardless of our event
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) {
|
2015-05-21 16:57:24 +08:00
|
|
|
__clear_bit(i, c->idxmsk);
|
2019-03-14 20:01:14 +08:00
|
|
|
w--;
|
|
|
|
continue;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* if measuring an exclusive event, sibling
|
|
|
|
* measuring non-exclusive, then counter cannot
|
|
|
|
* be used
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) {
|
2015-05-21 16:57:24 +08:00
|
|
|
__clear_bit(i, c->idxmsk);
|
2019-03-14 20:01:14 +08:00
|
|
|
w--;
|
|
|
|
continue;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* if we return an empty mask, then switch
|
|
|
|
* back to static empty constraint to avoid
|
|
|
|
* the cost of freeing later on
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
if (!w)
|
2015-05-21 16:57:24 +08:00
|
|
|
c = &emptyconstraint;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2019-03-14 20:01:14 +08:00
|
|
|
c->weight = w;
|
|
|
|
|
2015-05-21 16:57:24 +08:00
|
|
|
return c;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2019-03-14 16:57:57 +08:00
|
|
|
struct event_constraint *c1, *c2;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and requiring too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2019-03-14 16:57:57 +08:00
|
|
|
c1 = cpuc->event_constraint[idx];
|
2015-09-10 17:58:27 +08:00
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and requiring too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* first time only
|
|
|
|
* - static constraint: no change across incremental scheduling calls
|
|
|
|
* - dynamic constraint: handled by intel_get_excl_constraints()
|
|
|
|
*/
|
2014-11-18 03:07:01 +08:00
|
|
|
c2 = __intel_get_event_constraints(cpuc, idx, event);
|
2019-03-14 20:17:51 +08:00
|
|
|
if (c1) {
|
|
|
|
WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC));
|
2014-11-18 03:07:01 +08:00
|
|
|
bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
|
|
|
|
c1->weight = c2->weight;
|
|
|
|
c2 = c1;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and requiring too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
if (cpuc->excl_cntrs)
|
2014-11-18 03:07:01 +08:00
|
|
|
return intel_get_excl_constraints(cpuc, event, idx, c2);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and requiring too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2014-11-18 03:07:01 +08:00
|
|
|
return c2;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and requiring too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
|
|
|
int tid = cpuc->excl_thread_id;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and requiring too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
|
|
|
if (cpuc->is_fake)
|
|
|
|
return;
|
|
|
|
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
2015-05-21 16:57:17 +08:00
|
|
|
if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
|
|
|
|
hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
|
|
|
|
if (!--cpuc->n_excl)
|
|
|
|
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
2015-05-22 17:36:13 +08:00
|
|
|
* If event was actually assigned, then mark the counter state as
|
|
|
|
* unused now.
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
*/
|
2015-05-22 17:36:13 +08:00
|
|
|
if (hwc->idx >= 0) {
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* put_constraint may be called from x86_schedule_events()
|
|
|
|
* which already has the lock held so here make locking
|
|
|
|
* conditional.
|
|
|
|
*/
|
|
|
|
if (!xl->sched_started)
|
|
|
|
raw_spin_lock(&excl_cntrs->lock);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-21 16:57:21 +08:00
|
|
|
xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-22 17:36:13 +08:00
|
|
|
if (!xl->sched_started)
|
|
|
|
raw_spin_unlock(&excl_cntrs->lock);
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
static void
|
|
|
|
intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
|
2011-03-03 10:34:47 +08:00
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2011-06-06 22:57:03 +08:00
|
|
|
struct hw_perf_event_extra *reg;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
reg = &event->hw.extra_reg;
|
|
|
|
if (reg->idx != EXTRA_REG_NONE)
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, reg);
|
2012-02-10 06:20:53 +08:00
|
|
|
|
|
|
|
reg = &event->hw.branch_reg;
|
|
|
|
if (reg->idx != EXTRA_REG_NONE)
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, reg);
|
2011-06-06 22:57:03 +08:00
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
intel_put_shared_regs_event_constraints(cpuc, event);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if PMU has exclusive counter restrictions, then
|
|
|
|
* all events are subject to and must call the
|
|
|
|
* put_excl_constraints() routine
|
|
|
|
*/
|
2015-05-21 16:57:13 +08:00
|
|
|
if (cpuc->excl_cntrs)
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
intel_put_excl_constraints(cpuc, event);
|
|
|
|
}
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
static void intel_pebs_aliases_core2(struct perf_event *event)
|
2010-03-30 23:00:06 +08:00
|
|
|
{
|
2012-06-05 16:26:43 +08:00
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
2010-12-15 04:26:40 +08:00
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use INST_RETIRED.ANY_P
|
|
|
|
* (0x00c0), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* INST_RETIRED.ANY_P counts the number of cycles that retires
|
|
|
|
* CNTMASK instructions. By setting CNTMASK to a value (16)
|
|
|
|
* larger than the maximum number of instructions that can be
|
|
|
|
* retired per cycle (4) and then inverting the condition, we
|
|
|
|
* count all cycles that retire 16 or less instructions, which
|
|
|
|
* is every cycle.
|
|
|
|
*
|
|
|
|
* Thereby we gain a PEBS capable cycle counter.
|
|
|
|
*/
|
2012-03-12 19:44:35 +08:00
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_snb(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use UOPS_RETIRED.ALL
|
|
|
|
* (0x01c2), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* UOPS_RETIRED.ALL counts the number of cycles that retires
|
|
|
|
* CNTMASK micro-ops. By setting CNTMASK to a value (16)
|
|
|
|
* larger than the maximum number of micro-ops that can be
|
|
|
|
* retired per cycle (4) and then inverting the condition, we
|
|
|
|
* count all cycles that retire 16 or less micro-ops, which
|
|
|
|
* is every cycle.
|
|
|
|
*
|
|
|
|
* Thereby we gain a PEBS capable cycle counter.
|
|
|
|
*/
|
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
|
2010-12-15 04:26:40 +08:00
|
|
|
|
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
2012-06-05 16:26:43 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compared to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the same number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
static void intel_pebs_aliases_precdist(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use INST_RETIRED.PREC_DIST
|
|
|
|
* (0x01c0), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* The PREC_DIST event has special support to minimize sample
|
|
|
|
* shadowing effects. One drawback is that it can be
|
|
|
|
* only programmed on counter 1, but that seems like an
|
|
|
|
* acceptable trade off.
|
|
|
|
*/
|
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
|
|
|
|
|
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_ivb(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip < 3)
|
|
|
|
return intel_pebs_aliases_snb(event);
|
|
|
|
return intel_pebs_aliases_precdist(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_skl(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip < 3)
|
|
|
|
return intel_pebs_aliases_core2(event);
|
|
|
|
return intel_pebs_aliases_precdist(event);
|
|
|
|
}
|
|
|
|
|
2018-03-12 22:45:37 +08:00
|
|
|
static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
|
2015-05-28 12:13:14 +08:00
|
|
|
{
|
2018-03-12 22:45:37 +08:00
|
|
|
unsigned long flags = x86_pmu.large_pebs_flags;
|
2015-05-28 12:13:14 +08:00
|
|
|
|
|
|
|
if (event->attr.use_clockid)
|
|
|
|
flags &= ~PERF_SAMPLE_TIME;
|
2017-09-01 05:46:30 +08:00
|
|
|
if (!event->attr.exclude_kernel)
|
|
|
|
flags &= ~PERF_SAMPLE_REGS_USER;
|
2019-04-03 03:44:58 +08:00
|
|
|
if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
|
2017-09-01 05:46:30 +08:00
|
|
|
flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
|
2015-05-28 12:13:14 +08:00
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
2018-11-21 18:16:10 +08:00
|
|
|
static int intel_pmu_bts_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct perf_event_attr *attr = &event->attr;
|
|
|
|
|
2018-11-21 18:16:11 +08:00
|
|
|
if (unlikely(intel_pmu_has_bts(event))) {
|
2018-11-21 18:16:10 +08:00
|
|
|
/* BTS is not supported by this architecture. */
|
|
|
|
if (!x86_pmu.bts_active)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
/* BTS is currently only allowed for user-mode. */
|
|
|
|
if (!attr->exclude_kernel)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2018-11-21 18:16:12 +08:00
|
|
|
/* BTS is not allowed for precise events. */
|
|
|
|
if (attr->precise_ip)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2018-11-21 18:16:10 +08:00
|
|
|
/* disallow bts if conflicting events are present */
|
|
|
|
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
event->destroy = hw_perf_lbr_event_destroy;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Run the generic x86 hw_config step and, if it succeeds, the BTS
 * configuration step. The first non-zero result is returned.
 */
static int core_pmu_hw_config(struct perf_event *event)
{
	int err;

	err = x86_pmu_hw_config(event);

	return err ? err : intel_pmu_bts_config(event);
}
|
|
|
|
|
2021-01-29 06:40:09 +08:00
|
|
|
#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \
|
|
|
|
((x86_pmu.num_topdown_events - 1) << 8))
|
|
|
|
|
|
|
|
static bool is_available_metric_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
return is_metric_event(event) &&
|
|
|
|
event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accessing the new block fields is ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
static inline bool is_mem_loads_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool is_mem_loads_aux_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
|
|
|
static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
|
|
|
|
{
|
|
|
|
union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap);
|
|
|
|
|
|
|
|
return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
|
|
|
|
}
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accessing the new block fields is ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
static int intel_pmu_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = x86_pmu_hw_config(event);
|
|
|
|
|
2018-11-21 18:16:10 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = intel_pmu_bts_config(event);
|
2012-06-05 16:26:43 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2015-05-07 03:33:47 +08:00
|
|
|
if (event->attr.precise_ip) {
|
2019-05-14 08:34:00 +08:00
|
|
|
if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
|
2015-05-07 03:33:47 +08:00
|
|
|
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
|
2015-05-28 12:13:14 +08:00
|
|
|
if (!(event->attr.sample_type &
|
2020-12-01 03:38:41 +08:00
|
|
|
~intel_pmu_large_pebs_flags(event))) {
|
2018-03-12 22:45:37 +08:00
|
|
|
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
|
2020-12-01 03:38:41 +08:00
|
|
|
event->attach_state |= PERF_ATTACH_SCHED_CB;
|
|
|
|
}
|
perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)
PEBS always had the capability to log samples to its buffers without
an interrupt. Traditionally perf has not used this but always set the
PEBS threshold to one.
For frequently occurring events (like cycles or branches or load/store)
this in turn requires using a relatively high sampling period to avoid
overloading the system, by only processing PMIs. This in turn increases
sampling error.
For the common cases we still need to use the PMI because the PEBS
hardware has various limitations. The biggest one is that it can not
supply a callgraph. It also requires setting a fixed period, as the
hardware does not support adaptive period. Another issue is that it
cannot supply a time stamp and some other options. To supply a TID it
requires flushing on context switch. It can however supply the IP, the
load/store address, TSX information, registers, and some other things.
So we can make PEBS work for some specific cases, basically as long as
you can do without a callgraph and can set the period you can use this
new PEBS mode.
The main benefit is the ability to support much lower sampling period
(down to -c 1000) without extensive overhead.
One use case is for example to increase the resolution of the c2c tool.
Another is double checking when you suspect the standard sampling has
too much sampling error.
Some numbers on the overhead, using cycle soak, comparing the elapsed
time from "kernbench -M -H" between plain (threshold set to one) and
multi (large threshold).
The test command for plain:
"perf record --time -e cycles:p -c $period -- kernbench -M -H"
The test command for multi:
"perf record --no-time -e cycles:p -c $period -- kernbench -M -H"
( The only difference of test command between multi and plain is time
stamp options. Since time stamp is not supported by large PEBS
threshold, it can be used as a flag to indicate if large threshold is
enabled during the test. )
period plain(Sec) multi(Sec) Delta
10003 32.7 16.5 16.2
20003 30.2 16.2 14.0
40003 18.6 14.1 4.5
80003 16.8 14.6 2.2
100003 16.9 14.1 2.8
800003 15.4 15.7 -0.3
1000003 15.3 15.2 0.2
2000003 15.3 15.1 0.1
With periods below 100003, plain (threshold one) cause much more
overhead. With 10003 sampling period, the Elapsed Time for multi is
even 2X faster than plain.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1430940834-8964-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-07 03:33:50 +08:00
|
|
|
}
|
2015-05-07 03:33:47 +08:00
|
|
|
if (x86_pmu.pebs_aliases)
|
|
|
|
x86_pmu.pebs_aliases(event);
|
2018-05-10 21:48:41 +08:00
|
|
|
|
|
|
|
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
|
|
|
|
event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
|
2015-05-07 03:33:47 +08:00
|
|
|
}
|
2010-12-15 04:26:40 +08:00
|
|
|
|
2014-11-05 10:56:06 +08:00
|
|
|
if (needs_branch_stack(event)) {
|
2012-02-10 06:20:57 +08:00
|
|
|
ret = intel_pmu_setup_lbr_filter(event);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2020-12-01 03:38:41 +08:00
|
|
|
event->attach_state |= PERF_ATTACH_SCHED_CB;
|
2015-01-14 20:18:20 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* BTS is set up earlier in this path, so don't account twice
|
|
|
|
*/
|
2018-11-21 18:16:11 +08:00
|
|
|
if (!unlikely(intel_pmu_has_bts(event))) {
|
2015-01-14 20:18:20 +08:00
|
|
|
/* disallow lbr if conflicting events are present */
|
|
|
|
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
event->destroy = hw_perf_lbr_event_destroy;
|
|
|
|
}
|
2012-02-10 06:20:57 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
if (event->attr.aux_output) {
|
|
|
|
if (!event->attr.precise_ip)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
|
|
|
|
}
|
|
|
|
|
2010-03-30 23:00:06 +08:00
|
|
|
if (event->attr.type != PERF_TYPE_RAW)
|
|
|
|
return 0;
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoid the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may overflow. The bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
/*
|
|
|
|
* Config Topdown slots and metric events
|
|
|
|
*
|
|
|
|
* The slots event on Fixed Counter 3 can support sampling,
|
|
|
|
* which will be handled normally in x86_perf_event_update().
|
|
|
|
*
|
|
|
|
* Metric events don't support sampling and require being paired
|
|
|
|
* with a slots event as group leader. When the slots event
|
|
|
|
* is used in a metrics group, it too cannot support sampling.
|
|
|
|
*/
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
|
|
|
if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may overflow, in which case bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
if (event->attr.config1 || event->attr.config2)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The TopDown metrics events and slots event don't
|
|
|
|
* support any filters.
|
|
|
|
*/
|
|
|
|
if (event->attr.config & X86_ALL_EVENT_FLAGS)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2021-01-29 06:40:09 +08:00
|
|
|
if (is_available_metric_event(event)) {
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may overflow, in which case bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
struct perf_event *leader = event->group_leader;
|
|
|
|
|
|
|
|
/* The metric events don't support sampling. */
|
|
|
|
if (is_sampling_event(event))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* The metric events require a slots group leader. */
|
|
|
|
if (!is_slots_event(leader))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The leader/SLOTS must not be a sampling event for
|
|
|
|
* metric use; hardware requires it starts at 0 when used
|
|
|
|
* in conjunction with MSR_PERF_METRICS.
|
|
|
|
*/
|
|
|
|
if (is_sampling_event(leader))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
event->event_caps |= PERF_EV_CAP_SIBLING;
|
|
|
|
/*
|
|
|
|
* Only once we have a METRICs sibling do we
|
|
|
|
* need TopDown magic.
|
|
|
|
*/
|
|
|
|
leader->hw.flags |= PERF_X86_EVENT_TOPDOWN;
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_TOPDOWN;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accessing the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
/*
|
|
|
|
* The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR
|
|
|
|
* doesn't function quite right. As a work-around it needs to always be
|
|
|
|
* co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82).
|
|
|
|
* The actual count of this second event is irrelevant it just needs
|
|
|
|
* to be active to make the first event function correctly.
|
|
|
|
*
|
|
|
|
* In a group, the auxiliary event must be in front of the load latency
|
|
|
|
* event. The rule is to simplify the implementation of the check.
|
|
|
|
* That's because perf cannot have a complete group at the moment.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
|
|
|
|
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
|
|
|
|
is_mem_loads_event(event)) {
|
|
|
|
struct perf_event *leader = event->group_leader;
|
|
|
|
struct perf_event *sibling = NULL;
|
|
|
|
|
|
|
|
if (!is_mem_loads_aux_event(leader)) {
|
|
|
|
for_each_sibling_event(sibling, leader) {
|
|
|
|
if (is_mem_loads_aux_event(sibling))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list))
|
|
|
|
return -ENODATA;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-03-30 23:00:06 +08:00
|
|
|
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (x86_pmu.version < 3)
|
|
|
|
return -EINVAL;
|
|
|
|
|
perf_event: Add support for LSM and SELinux checks
In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl. This has a number of
limitations:
1. The sysctl is only a single value. Many types of accesses are controlled
based on the single value thus making the control very limited and
coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
all processes get access to perf_event_open(2) opening the door to
security issues.
This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.
5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
syscall itself. The hook is called from all the places that the
perf_event_paranoid sysctl is checked to keep it consistent with the
sysctl. The hook gets passed a 'type' argument which controls CPU,
kernel and tracepoint accesses (in this context, CPU, kernel and
tracepoint have the same semantics as the perf_event_paranoid sysctl).
Additionally, I added an 'open' type which is similar to
perf_event_paranoid sysctl == 3 patch carried in Android and several other
distros but was rejected in mainline [1] in 2016.
2. perf_event_alloc: This allocates a new security object for the event
which stores the current SID within the event. It will be useful when
the perf event's FD is passed through IPC to another process which may
try to read the FD. Appropriate security checks will limit access.
3. perf_event_free: Called when the event is closed.
4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
5. perf_event_write: Called from the ioctl(2) syscalls for the event.
[1] https://lwn.net/Articles/696240/
Since Peter had suggested LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.
To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Co-developed-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: James Morris <jmorris@namei.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: rostedt@goodmis.org
Cc: Yonghong Song <yhs@fb.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: jeffv@google.com
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: primiano@google.com
Cc: Song Liu <songliubraving@fb.com>
Cc: rsavitski@google.com
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Matthew Garrett <matthewgarrett@google.com>
Link: https://lkml.kernel.org/r/20191014170308.70668-1-joel@joelfernandes.org
2019-10-15 01:03:08 +08:00
|
|
|
ret = perf_allow_cpu(&event->attr);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2010-03-30 23:00:06 +08:00
|
|
|
|
|
|
|
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
2021-04-12 22:30:45 +08:00
|
|
|
u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
|
2011-10-05 20:01:21 +08:00
|
|
|
|
|
|
|
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
|
2021-04-12 22:30:45 +08:00
|
|
|
arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
|
|
|
|
arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
|
2019-02-05 06:23:30 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
|
|
|
|
arr[0].guest &= ~cpuc->pebs_enabled;
|
|
|
|
else
|
|
|
|
arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
|
|
|
|
*nr = 1;
|
|
|
|
|
|
|
|
if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
|
|
|
|
/*
|
|
|
|
* If PMU counter has PEBS enabled it is not enough to
|
|
|
|
* disable counter on a guest entry since PEBS memory
|
|
|
|
* write can overshoot guest entry and corrupt guest
|
|
|
|
* memory. Disabling PEBS solves the problem.
|
|
|
|
*
|
|
|
|
* Don't do this if the CPU already enforces it.
|
|
|
|
*/
|
|
|
|
arr[1].msr = MSR_IA32_PEBS_ENABLE;
|
|
|
|
arr[1].host = cpuc->pebs_enabled;
|
|
|
|
arr[1].guest = 0;
|
|
|
|
*nr = 2;
|
|
|
|
}
|
2011-10-05 20:01:21 +08:00
|
|
|
|
|
|
|
return arr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
|
|
|
struct perf_event *event = cpuc->events[idx];
|
|
|
|
|
|
|
|
arr[idx].msr = x86_pmu_config_addr(idx);
|
|
|
|
arr[idx].host = arr[idx].guest = 0;
|
|
|
|
|
|
|
|
if (!test_bit(idx, cpuc->active_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
arr[idx].host = arr[idx].guest =
|
|
|
|
event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
|
|
|
|
if (event->attr.exclude_host)
|
|
|
|
arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
else if (event->attr.exclude_guest)
|
|
|
|
arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*nr = x86_pmu.num_counters;
|
|
|
|
return arr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void core_pmu_enable_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (!event->attr.exclude_host)
|
|
|
|
x86_pmu_enable_event(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void core_pmu_enable_all(int added)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
int idx;
|
|
|
|
|
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
|
|
|
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
|
|
|
|
|
|
|
|
if (!test_bit(idx, cpuc->active_mask) ||
|
|
|
|
cpuc->events[idx]->attr.exclude_host)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static int hsw_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = intel_pmu_hw_config(event);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
|
|
|
|
return 0;
|
|
|
|
event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
|
|
|
|
* PEBS or in ANY thread mode. Since the results are non-sensical forbid
|
|
|
|
* this combination.
|
|
|
|
*/
|
|
|
|
if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
|
|
|
|
((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
|
|
|
|
event->attr.precise_ip > 0))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2013-09-06 11:37:38 +08:00
|
|
|
if (event_is_checkpointed(event)) {
|
|
|
|
/*
|
|
|
|
* Sampling of checkpointed events can cause situations where
|
|
|
|
* the CPU constantly aborts because of a overflow, which is
|
|
|
|
* then checkpointed back and ignored. Forbid checkpointing
|
|
|
|
* for sampling.
|
|
|
|
*
|
|
|
|
* But still allow a long sampling period, so that perf stat
|
|
|
|
* from KVM works.
|
|
|
|
*/
|
|
|
|
if (event->attr.sample_period > 0 &&
|
|
|
|
event->attr.sample_period < 0x7fffffff)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
2013-06-18 08:36:48 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-07-12 21:44:23 +08:00
|
|
|
static struct event_constraint counter0_constraint =
|
|
|
|
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint counter2_constraint =
|
|
|
|
EVENT_CONSTRAINT(0, 0x4, 0);
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct event_constraint fixed0_constraint =
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0);
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
static struct event_constraint fixed0_counter0_constraint =
|
|
|
|
INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL);
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint *
|
2014-11-18 03:06:56 +08:00
|
|
|
hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
2013-06-18 08:36:48 +08:00
|
|
|
{
|
2014-11-18 03:06:56 +08:00
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
|
|
|
|
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
|
|
|
|
if (c->idxmsk64 & (1U << 2))
|
|
|
|
return &counter2_constraint;
|
|
|
|
return &emptyconstraint;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Fixed counter 0 has less skid.
|
|
|
|
* Force instruction:ppp in Fixed counter 0
|
|
|
|
*/
|
|
|
|
if ((event->attr.precise_ip == 3) &&
|
|
|
|
constraint_match(&fixed0_constraint, event->hw.config))
|
|
|
|
return &fixed0_constraint;
|
|
|
|
|
|
|
|
return hsw_get_event_constraints(cpuc, idx, event);
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accesses to the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
c = icl_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The :ppp indicates the Precise Distribution (PDist) facility, which
|
|
|
|
* is only supported on the GP counter 0. If a :ppp event which is not
|
|
|
|
* available on the GP counter 0, error out.
|
|
|
|
*/
|
|
|
|
if (event->attr.precise_ip == 3) {
|
|
|
|
if (c->idxmsk64 & BIT_ULL(0))
|
|
|
|
return &counter0_constraint;
|
|
|
|
|
|
|
|
return &emptyconstraint;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2017-07-12 21:44:23 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
/* :ppp means to do reduced skid PEBS which is PMC0 only. */
|
|
|
|
if (event->attr.precise_ip == 3)
|
|
|
|
return &counter0_constraint;
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* :ppp means to do reduced skid PEBS,
|
|
|
|
* which is available on PMC0 and fixed counter 0.
|
|
|
|
*/
|
|
|
|
if (event->attr.precise_ip == 3) {
|
|
|
|
/* Force instruction:ppp on PMC0 and Fixed counter 0 */
|
|
|
|
if (constraint_match(&fixed0_constraint, event->hw.config))
|
|
|
|
return &fixed0_counter0_constraint;
|
|
|
|
|
|
|
|
return &counter0_constraint;
|
|
|
|
}
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
static bool allow_tsx_force_abort = true;
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Without TFA we must not use PMC3.
|
|
|
|
*/
|
2019-03-14 16:57:57 +08:00
|
|
|
if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
|
2019-03-06 05:23:18 +08:00
|
|
|
c = dyn_constraint(cpuc, c, idx);
|
|
|
|
c->idxmsk64 &= ~(1ULL << 3);
|
|
|
|
c->weight--;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
/*
|
|
|
|
* Broadwell:
|
|
|
|
*
|
|
|
|
* The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
|
|
|
|
* (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
|
|
|
|
* the two to enforce a minimum period of 128 (the smallest value that has bits
|
|
|
|
* 0-5 cleared and >= 100).
|
|
|
|
*
|
|
|
|
* Because of how the code in x86_perf_event_set_period() works, the truncation
|
|
|
|
* of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
|
|
|
|
* to make up for the 'lost' events due to carrying the 'error' in period_left.
|
|
|
|
*
|
|
|
|
* Therefore the effective (average) period matches the requested period,
|
|
|
|
* despite coarser hardware granularity.
|
|
|
|
*/
|
2018-03-02 01:54:54 +08:00
|
|
|
static u64 bdw_limit_period(struct perf_event *event, u64 left)
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
{
|
|
|
|
if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
|
|
|
|
X86_CONFIG(.event=0xc0, .umask=0x01)) {
|
|
|
|
if (left < 128)
|
|
|
|
left = 128;
|
2018-03-17 19:52:16 +08:00
|
|
|
left &= ~0x3fULL;
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
}
|
|
|
|
return left;
|
|
|
|
}
|
|
|
|
|
2019-08-20 07:13:31 +08:00
|
|
|
/*
 * Clamp the period to a minimum of 32; larger values of @left are
 * returned unchanged.  @event is not consulted.
 */
static u64 nhm_limit_period(struct perf_event *event, u64 left)
{
	if (left < 32ULL)
		return 32ULL;

	return left;
}
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accesses to the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
/*
 * For a :ppp event (precise_ip == 3) clamp the period to a minimum of
 * 128; every other event keeps @left unchanged.
 */
static u64 spr_limit_period(struct perf_event *event, u64 left)
{
	if (event->attr.precise_ip != 3)
		return left;

	return max(left, 128ULL);
}
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
/*
 * Format attributes: each entry names a field and the bit range of
 * "config" that it occupies.  Each PMU_FORMAT_ATTR(name, ...) provides
 * the format_attr_<name> object referenced from the attribute arrays.
 */
PMU_FORMAT_ATTR(event,	"config:0-7"	);
PMU_FORMAT_ATTR(umask,	"config:8-15"	);
PMU_FORMAT_ATTR(edge,	"config:18"	);
PMU_FORMAT_ATTR(pc,	"config:19"	);
PMU_FORMAT_ATTR(any,	"config:21"	); /* v3 + */
PMU_FORMAT_ATTR(inv,	"config:23"	);
PMU_FORMAT_ATTR(cmask,	"config:24-31"	);
PMU_FORMAT_ATTR(in_tx,  "config:32");
PMU_FORMAT_ATTR(in_tx_cp, "config:33");
|
2012-03-16 03:09:14 +08:00
|
|
|
|
|
|
|
/*
 * NULL-terminated list of format attributes: event, umask, edge, pc, inv
 * and cmask.  The "any", "in_tx" and "in_tx_cp" fields are deliberately
 * not part of this list.
 */
static struct attribute *intel_arch_formats_attr[] = {
	&format_attr_event.attr,
	&format_attr_umask.attr,
	&format_attr_edge.attr,
	&format_attr_pc.attr,
	&format_attr_inv.attr,
	&format_attr_cmask.attr,
	NULL,
};
|
|
|
|
|
2012-10-10 20:53:14 +08:00
|
|
|
/*
 * Extract the event-select field of @config (the bits covered by
 * ARCH_PERFMON_EVENTSEL_EVENT) and hand @page, @config and that event
 * code to x86_event_sysfs_show(), returning its result.
 */
ssize_t intel_event_sysfs_show(char *page, u64 config)
{
	return x86_event_sysfs_show(page, config,
				    config & ARCH_PERFMON_EVENTSEL_EVENT);
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
static struct intel_shared_regs *allocate_shared_regs(int cpu)
|
2011-06-06 22:57:03 +08:00
|
|
|
{
|
|
|
|
struct intel_shared_regs *regs;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
regs = kzalloc_node(sizeof(struct intel_shared_regs),
|
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (regs) {
|
|
|
|
/*
|
|
|
|
* initialize the locks to keep lockdep happy
|
|
|
|
*/
|
|
|
|
for (i = 0; i < EXTRA_REG_MAX; i++)
|
|
|
|
raw_spin_lock_init(®s->regs[i].lock);
|
|
|
|
|
|
|
|
regs->core_id = -1;
|
|
|
|
}
|
|
|
|
return regs;
|
|
|
|
}
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *c;
|
|
|
|
|
|
|
|
c = kzalloc_node(sizeof(struct intel_excl_cntrs),
|
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (c) {
|
|
|
|
raw_spin_lock_init(&c->lock);
|
|
|
|
c->core_id = -1;
|
|
|
|
}
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
|
|
|
|
{
|
perf/x86/intel: Support adaptive PEBS v4
Adaptive PEBS is a new way to report PEBS sampling information. Instead
of a fixed size record for all PEBS events it allows to configure the
PEBS record to only include the information needed. Events can then opt
in to use such an extended record, or stay with a basic record which
only contains the IP.
The major new feature is to support LBRs in PEBS record.
Besides normal LBR, this allows (much faster) large PEBS, while still
supporting callstacks through callstack LBR. So essentially a lot of
profiling can now be done without frequent interrupts, dropping the
overhead significantly.
The main requirement still is to use a period, and not use frequency
mode, because frequency mode requires reevaluating the frequency on each
overflow.
The floating point state (XMM) is also supported, which allows efficient
profiling of FP function arguments.
Introduce specific drain function to handle variable length records.
Use a new callback to parse the new record format, and also handle the
STATUS field now being at a different offset.
Add code to set up the configuration register. Since there is only a
single register, all events either get the full super set of all events,
or only the basic record.
Originally-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: jolsa@kernel.org
Link: https://lkml.kernel.org/r/20190402194509.2832-6-kan.liang@linux.intel.com
[ Renamed GPRS => GP. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-03 03:45:02 +08:00
|
|
|
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
|
|
|
|
|
2021-04-12 22:30:50 +08:00
|
|
|
if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
|
2014-11-18 03:06:57 +08:00
|
|
|
cpuc->shared_regs = allocate_shared_regs(cpu);
|
|
|
|
if (!cpuc->shared_regs)
|
2015-08-10 20:17:34 +08:00
|
|
|
goto err;
|
2014-11-18 03:06:57 +08:00
|
|
|
}
|
2011-03-03 10:34:50 +08:00
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
|
2014-11-18 03:06:57 +08:00
|
|
|
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
|
2014-11-18 03:06:57 +08:00
|
|
|
if (!cpuc->constraint_list)
|
2015-08-10 20:17:34 +08:00
|
|
|
goto err_shared_regs;
|
2019-03-06 05:23:18 +08:00
|
|
|
}
|
2014-11-18 03:06:57 +08:00
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
|
2014-11-18 03:06:57 +08:00
|
|
|
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
|
2015-08-10 20:17:34 +08:00
|
|
|
if (!cpuc->excl_cntrs)
|
|
|
|
goto err_constraint_list;
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
cpuc->excl_thread_id = 0;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
return 0;
|
2015-08-10 20:17:34 +08:00
|
|
|
|
|
|
|
err_constraint_list:
|
|
|
|
kfree(cpuc->constraint_list);
|
|
|
|
cpuc->constraint_list = NULL;
|
|
|
|
|
|
|
|
err_shared_regs:
|
|
|
|
kfree(cpuc->shared_regs);
|
|
|
|
cpuc->shared_regs = NULL;
|
|
|
|
|
|
|
|
err:
|
2016-07-14 01:16:10 +08:00
|
|
|
return -ENOMEM;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
static int intel_pmu_cpu_prepare(int cpu)
|
|
|
|
{
|
|
|
|
return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
|
|
|
|
}
|
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static void flip_smm_bit(void *data)
|
|
|
|
{
|
|
|
|
unsigned long set = *(unsigned long *)data;
|
|
|
|
|
|
|
|
if (set > 0) {
|
|
|
|
msr_set_bit(MSR_IA32_DEBUGCTLMSR,
|
|
|
|
DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
|
|
|
|
} else {
|
|
|
|
msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
|
|
|
|
DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-03-05 20:49:35 +08:00
|
|
|
static void intel_pmu_cpu_starting(int cpu)
|
|
|
|
{
|
2011-03-03 10:34:47 +08:00
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
int core_id = topology_core_id(cpu);
|
|
|
|
int i;
|
|
|
|
|
2011-03-03 10:34:50 +08:00
|
|
|
init_debug_store_on_cpu(cpu);
|
|
|
|
/*
|
|
|
|
* Deal with CPUs that don't clear their LBRs on power-up.
|
|
|
|
*/
|
|
|
|
intel_pmu_lbr_reset();
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->lbr_sel = NULL;
|
|
|
|
|
2019-03-21 20:38:49 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_TFA) {
|
|
|
|
WARN_ON_ONCE(cpuc->tfa_shadow);
|
|
|
|
cpuc->tfa_shadow = ~0ULL;
|
|
|
|
intel_set_tfa(cpuc, false);
|
|
|
|
}
|
|
|
|
|
2018-04-26 02:57:17 +08:00
|
|
|
if (x86_pmu.version > 1)
|
|
|
|
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
|
2017-05-12 22:51:13 +08:00
|
|
|
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for thse two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
|
|
|
/*
|
|
|
|
* Disable perf metrics if any added CPU doesn't support it.
|
|
|
|
*
|
|
|
|
* Turn off the check for a hybrid architecture, because the
|
|
|
|
* architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate
|
|
|
|
* the architecture features. The perf metrics is a model-specific
|
|
|
|
* feature for now. The corresponding bit should always be 0 on
|
|
|
|
* a hybrid platform, e.g., Alder Lake.
|
|
|
|
*/
|
|
|
|
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) {
|
2020-10-02 05:17:11 +08:00
|
|
|
union perf_capabilities perf_cap;
|
|
|
|
|
|
|
|
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
|
|
|
|
if (!perf_cap.perf_metrics) {
|
|
|
|
x86_pmu.intel_cap.perf_metrics = 0;
|
|
|
|
x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
if (!cpuc->shared_regs)
|
2011-03-03 10:34:50 +08:00
|
|
|
return;
|
|
|
|
|
2014-11-18 03:06:53 +08:00
|
|
|
if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
|
2015-05-26 21:11:28 +08:00
|
|
|
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
|
2012-02-10 06:20:53 +08:00
|
|
|
struct intel_shared_regs *pc;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
pc = per_cpu(cpu_hw_events, i).shared_regs;
|
|
|
|
if (pc && pc->core_id == core_id) {
|
2016-01-28 06:31:09 +08:00
|
|
|
cpuc->kfree_on_online[0] = cpuc->shared_regs;
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->shared_regs = pc;
|
|
|
|
break;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->shared_regs->core_id = core_id;
|
|
|
|
cpuc->shared_regs->refcnt++;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
if (x86_pmu.lbr_sel_map)
|
|
|
|
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
|
2014-11-18 03:06:57 +08:00
|
|
|
|
|
|
|
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
|
2015-05-26 21:11:28 +08:00
|
|
|
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
|
2017-01-16 11:21:11 +08:00
|
|
|
struct cpu_hw_events *sibling;
|
2014-11-18 03:06:57 +08:00
|
|
|
struct intel_excl_cntrs *c;
|
|
|
|
|
2017-01-16 11:21:11 +08:00
|
|
|
sibling = &per_cpu(cpu_hw_events, i);
|
|
|
|
c = sibling->excl_cntrs;
|
2014-11-18 03:06:57 +08:00
|
|
|
if (c && c->core_id == core_id) {
|
|
|
|
cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
|
|
|
|
cpuc->excl_cntrs = c;
|
2017-01-16 11:21:11 +08:00
|
|
|
if (!sibling->excl_thread_id)
|
|
|
|
cpuc->excl_thread_id = 1;
|
2014-11-18 03:06:57 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cpuc->excl_cntrs->core_id = core_id;
|
|
|
|
cpuc->excl_cntrs->refcnt++;
|
|
|
|
}
|
2010-03-05 20:49:35 +08:00
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
static void free_excl_cntrs(struct cpu_hw_events *cpuc)
|
2010-03-05 20:49:35 +08:00
|
|
|
{
|
2014-11-18 03:06:57 +08:00
|
|
|
struct intel_excl_cntrs *c;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
c = cpuc->excl_cntrs;
|
|
|
|
if (c) {
|
|
|
|
if (c->core_id == -1 || --c->refcnt == 0)
|
|
|
|
kfree(c);
|
|
|
|
cpuc->excl_cntrs = NULL;
|
|
|
|
}
|
2019-03-06 05:23:18 +08:00
|
|
|
|
|
|
|
kfree(cpuc->constraint_list);
|
|
|
|
cpuc->constraint_list = NULL;
|
2014-11-18 03:07:04 +08:00
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2014-11-18 03:07:04 +08:00
|
|
|
/*
 * Counterpart of intel_pmu_cpu_starting() for a CPU going away: the only
 * work done here is the fini_debug_store_on_cpu() call for @cpu.
 */
static void intel_pmu_cpu_dying(int cpu)
{
	fini_debug_store_on_cpu(cpu);
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
|
2014-11-18 03:07:04 +08:00
|
|
|
{
|
|
|
|
struct intel_shared_regs *pc;
|
|
|
|
|
|
|
|
pc = cpuc->shared_regs;
|
|
|
|
if (pc) {
|
|
|
|
if (pc->core_id == -1 || --pc->refcnt == 0)
|
|
|
|
kfree(pc);
|
|
|
|
cpuc->shared_regs = NULL;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -e r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -e r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding dynamic detection of whether HT is on turned out to
be complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
free_excl_cntrs(cpuc);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_cpu_dead(int cpu)
|
|
|
|
{
|
|
|
|
intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
|
2010-03-05 20:49:35 +08:00
|
|
|
}
|
|
|
|
|
2015-05-07 03:33:51 +08:00
|
|
|
static void intel_pmu_sched_task(struct perf_event_context *ctx,
|
|
|
|
bool sched_in)
|
|
|
|
{
|
perf/x86/intel: Add proper condition to run sched_task callbacks
We have 2 functions using the same sched_task callback:
- PEBS drain for free running counters
- LBR save/store
Both of them are called from intel_pmu_sched_task() and
either of them can be unwillingly triggered when the
other one is configured to run.
Let's say there's PEBS drain configured in sched_task
callback for the event, but in the callback itself
(intel_pmu_sched_task()) we will also run the code for
LBR save/restore, which we did not ask for, but the
code in intel_pmu_sched_task() does not check for that.
This can lead to extra cycles in some perf monitoring,
like when we monitor PEBS event without LBR data.
# perf record --no-timestamp -c 10000 -e cycles:p ./perf bench sched pipe -l 1000000
(We need PEBS, non freq/non timestamp event to enable
the sched_task callback)
The perf stat of cycles and msr:write_msr for above
command before the change:
...
Performance counter stats for './perf record --no-timestamp -c 10000 -e cycles:p \
./perf bench sched pipe -l 1000000' (5 runs):
18,519,557,441 cycles:k
91,195,527 msr:write_msr
29.334476406 seconds time elapsed
And after the change:
...
Performance counter stats for './perf record --no-timestamp -c 10000 -e cycles:p \
./perf bench sched pipe -l 1000000' (5 runs):
18,704,973,540 cycles:k
27,184,720 msr:write_msr
16.977875900 seconds time elapsed
There's no effect on cycles:k because the sched_task happens
with events switched off, however the msr:write_msr tracepoint
counter together with almost 50% of time speedup show the
improvement.
Monitoring LBR event and having extra PEBS drain processing
in sched_task callback showed just a little speedup, because
the drain function does not do much extra work in case there
is no PEBS data.
Adding conditions to recognize the configured work that needs
to be done in the x86_pmu's sched_task callback.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20170719075247.GA27506@krava
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-19 15:52:47 +08:00
|
|
|
intel_pmu_pebs_sched_task(ctx, sched_in);
|
|
|
|
intel_pmu_lbr_sched_task(ctx, sched_in);
|
2015-05-07 03:33:51 +08:00
|
|
|
}
|
|
|
|
|
2019-10-23 15:13:56 +08:00
|
|
|
static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
|
|
|
|
struct perf_event_context *next)
|
|
|
|
{
|
|
|
|
intel_pmu_lbr_swap_task_ctx(prev, next);
|
|
|
|
}
|
|
|
|
|
2019-02-04 20:35:32 +08:00
|
|
|
/*
 * x86_pmu::check_period callback.
 *
 * Rejects the requested period with -EINVAL when
 * intel_pmu_has_bts_period() reports a match for @event and @value;
 * otherwise returns 0.
 */
static int intel_pmu_check_period(struct perf_event *event, u64 value)
{
	if (intel_pmu_has_bts_period(event, value))
		return -EINVAL;

	return 0;
}
|
|
|
|
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
static int intel_pmu_aux_output_match(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (!x86_pmu.intel_cap.pebs_output_pt_available)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return is_intel_pt_event(event);
|
|
|
|
}
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
|
|
|
|
|
2013-01-24 23:10:33 +08:00
|
|
|
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
|
|
|
|
|
2015-09-10 05:53:59 +08:00
|
|
|
PMU_FORMAT_ATTR(frontend, "config1:0-23");
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
static struct attribute *intel_arch3_formats_attr[] = {
|
|
|
|
&format_attr_event.attr,
|
|
|
|
&format_attr_umask.attr,
|
|
|
|
&format_attr_edge.attr,
|
|
|
|
&format_attr_pc.attr,
|
|
|
|
&format_attr_any.attr,
|
|
|
|
&format_attr_inv.attr,
|
|
|
|
&format_attr_cmask.attr,
|
2017-08-23 02:52:00 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *hsw_format_attr[] = {
|
2013-06-18 08:36:48 +08:00
|
|
|
&format_attr_in_tx.attr,
|
|
|
|
&format_attr_in_tx_cp.attr,
|
2017-08-23 02:52:00 +08:00
|
|
|
&format_attr_offcore_rsp.attr,
|
|
|
|
&format_attr_ldlat.attr,
|
|
|
|
NULL
|
|
|
|
};
|
2012-03-16 03:09:14 +08:00
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
static struct attribute *nhm_format_attr[] = {
|
|
|
|
&format_attr_offcore_rsp.attr,
|
|
|
|
&format_attr_ldlat.attr,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *slm_format_attr[] = {
|
|
|
|
&format_attr_offcore_rsp.attr,
|
|
|
|
NULL
|
2012-03-16 03:09:14 +08:00
|
|
|
};
|
|
|
|
|
2015-09-10 05:53:59 +08:00
|
|
|
static struct attribute *skl_format_attr[] = {
|
|
|
|
&format_attr_frontend.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2015-04-21 23:26:23 +08:00
|
|
|
static __initconst const struct x86_pmu core_pmu = {
|
|
|
|
.name = "core",
|
|
|
|
.handle_irq = x86_pmu_handle_irq,
|
|
|
|
.disable_all = x86_pmu_disable_all,
|
|
|
|
.enable_all = core_pmu_enable_all,
|
|
|
|
.enable = core_pmu_enable_event,
|
|
|
|
.disable = x86_pmu_disable_event,
|
2018-11-21 18:16:10 +08:00
|
|
|
.hw_config = core_pmu_hw_config,
|
2015-04-21 23:26:23 +08:00
|
|
|
.schedule_events = x86_schedule_events,
|
|
|
|
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
|
|
|
|
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
|
|
|
|
.event_map = intel_pmu_event_map,
|
|
|
|
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
|
|
|
|
.apic = 1,
|
2018-03-12 22:45:37 +08:00
|
|
|
.large_pebs_flags = LARGE_PEBS_FLAGS,
|
2015-05-28 12:13:14 +08:00
|
|
|
|
2015-04-21 23:26:23 +08:00
|
|
|
/*
|
|
|
|
* Intel PMCs cannot be accessed sanely above 32-bit width,
|
|
|
|
* so we install an artificial 1<<31 period regardless of
|
|
|
|
* the generic event period:
|
|
|
|
*/
|
|
|
|
.max_period = (1ULL<<31) - 1,
|
|
|
|
.get_event_constraints = intel_get_event_constraints,
|
|
|
|
.put_event_constraints = intel_put_event_constraints,
|
|
|
|
.event_constraints = intel_core_event_constraints,
|
|
|
|
.guest_get_msrs = core_guest_get_msrs,
|
|
|
|
.format_attrs = intel_arch_formats_attr,
|
|
|
|
.events_sysfs_show = intel_event_sysfs_show,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Virtual (or funny metal) CPU can define x86_pmu.extra_regs
|
|
|
|
* together with PMU version 1 and thus be using core_pmu with
|
|
|
|
* shared_regs. We need following callbacks here to allocate
|
|
|
|
* it properly.
|
|
|
|
*/
|
|
|
|
.cpu_prepare = intel_pmu_cpu_prepare,
|
|
|
|
.cpu_starting = intel_pmu_cpu_starting,
|
|
|
|
.cpu_dying = intel_pmu_cpu_dying,
|
2018-12-20 00:53:50 +08:00
|
|
|
.cpu_dead = intel_pmu_cpu_dead,
|
2019-02-04 20:35:32 +08:00
|
|
|
|
|
|
|
.check_period = intel_pmu_check_period,
|
2020-07-03 20:49:08 +08:00
|
|
|
|
|
|
|
.lbr_reset = intel_pmu_lbr_reset_64,
|
2020-07-03 20:49:09 +08:00
|
|
|
.lbr_read = intel_pmu_lbr_read_64,
|
2020-07-03 20:49:10 +08:00
|
|
|
.lbr_save = intel_pmu_lbr_save,
|
|
|
|
.lbr_restore = intel_pmu_lbr_restore,
|
2015-04-21 23:26:23 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const struct x86_pmu intel_pmu = {
|
2010-02-26 19:05:05 +08:00
|
|
|
.name = "Intel",
|
|
|
|
.handle_irq = intel_pmu_handle_irq,
|
|
|
|
.disable_all = intel_pmu_disable_all,
|
|
|
|
.enable_all = intel_pmu_enable_all,
|
|
|
|
.enable = intel_pmu_enable_event,
|
|
|
|
.disable = intel_pmu_disable_event,
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
.add = intel_pmu_add_event,
|
|
|
|
.del = intel_pmu_del_event,
|
2018-02-13 06:20:34 +08:00
|
|
|
.read = intel_pmu_read_event,
|
2010-03-30 23:00:06 +08:00
|
|
|
.hw_config = intel_pmu_hw_config,
|
2010-03-12 00:54:39 +08:00
|
|
|
.schedule_events = x86_schedule_events,
|
2010-02-26 19:05:05 +08:00
|
|
|
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
|
|
|
|
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
|
|
|
|
.event_map = intel_pmu_event_map,
|
|
|
|
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
|
|
|
|
.apic = 1,
|
2018-03-12 22:45:37 +08:00
|
|
|
.large_pebs_flags = LARGE_PEBS_FLAGS,
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Intel PMCs cannot be accessed sanely above 32 bit width,
|
|
|
|
* so we install an artificial 1<<31 period regardless of
|
|
|
|
* the generic event period:
|
|
|
|
*/
|
|
|
|
.max_period = (1ULL << 31) - 1,
|
2010-03-05 20:01:18 +08:00
|
|
|
.get_event_constraints = intel_get_event_constraints,
|
2011-03-03 10:34:47 +08:00
|
|
|
.put_event_constraints = intel_put_event_constraints,
|
2012-06-05 16:26:43 +08:00
|
|
|
.pebs_aliases = intel_pebs_aliases_core2,
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
.format_attrs = intel_arch3_formats_attr,
|
2012-10-10 20:53:14 +08:00
|
|
|
.events_sysfs_show = intel_event_sysfs_show,
|
2012-03-16 03:09:14 +08:00
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
.cpu_prepare = intel_pmu_cpu_prepare,
|
2010-03-05 20:49:35 +08:00
|
|
|
.cpu_starting = intel_pmu_cpu_starting,
|
|
|
|
.cpu_dying = intel_pmu_cpu_dying,
|
2018-12-20 00:53:50 +08:00
|
|
|
.cpu_dead = intel_pmu_cpu_dead,
|
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
.guest_get_msrs = intel_guest_get_msrs,
|
2015-05-07 03:33:51 +08:00
|
|
|
.sched_task = intel_pmu_sched_task,
|
2019-10-23 15:13:56 +08:00
|
|
|
.swap_task_ctx = intel_pmu_swap_task_ctx,
|
2019-02-04 20:35:32 +08:00
|
|
|
|
|
|
|
.check_period = intel_pmu_check_period,
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
|
|
|
|
.aux_output_match = intel_pmu_aux_output_match,
|
2020-07-03 20:49:08 +08:00
|
|
|
|
|
|
|
.lbr_reset = intel_pmu_lbr_reset_64,
|
2020-07-03 20:49:09 +08:00
|
|
|
.lbr_read = intel_pmu_lbr_read_64,
|
2020-07-03 20:49:10 +08:00
|
|
|
.lbr_save = intel_pmu_lbr_save,
|
|
|
|
.lbr_restore = intel_pmu_lbr_restore,
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_clovertown_quirk(void)
|
2010-03-05 04:49:01 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* PEBS is unreliable due to:
|
|
|
|
*
|
|
|
|
* AJ67 - PEBS may experience CPL leaks
|
|
|
|
* AJ68 - PEBS PMI may be delayed by one event
|
|
|
|
* AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
|
|
|
|
* AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
|
|
|
|
*
|
|
|
|
* AJ67 could be worked around by restricting the OS/USR flags.
|
|
|
|
* AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
|
|
|
|
*
|
|
|
|
* AJ106 could possibly be worked around by not allowing LBR
|
|
|
|
* usage from PEBS, including the fixup.
|
|
|
|
* AJ68 could possibly be worked around by always programming
|
2011-04-27 17:51:41 +08:00
|
|
|
* a pebs_event_reset[0] value and coping with the lost events.
|
2010-03-05 04:49:01 +08:00
|
|
|
*
|
|
|
|
* But taken together it might just make sense to not enable PEBS on
|
|
|
|
* these chips.
|
|
|
|
*/
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("PEBS disabled due to CPU errata\n");
|
2010-03-05 04:49:01 +08:00
|
|
|
x86_pmu.pebs = 0;
|
|
|
|
x86_pmu.pebs_constraints = NULL;
|
|
|
|
}
|
|
|
|
|
2019-02-05 06:23:30 +08:00
|
|
|
static const struct x86_cpu_desc isolation_ucodes[] = {
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL, 3, 0x0000001f),
|
2019-08-28 03:48:22 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_L, 1, 0x0000001e),
|
2019-08-28 03:48:23 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_G, 1, 0x00000015),
|
2019-02-05 06:23:30 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a),
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL, 4, 0x00000023),
|
2019-08-28 03:48:23 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_G, 1, 0x00000014),
|
2019-08-28 03:48:24 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 2, 0x00000010),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002),
|
2019-02-05 06:23:30 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
|
2021-02-06 03:13:24 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
|
2019-08-28 03:48:22 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
|
2019-08-28 03:48:22 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 9, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 10, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 11, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 12, 0x0000004e),
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 10, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 11, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 12, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 13, 0x0000004e),
|
2019-02-05 06:23:30 +08:00
|
|
|
{}
|
|
|
|
};
|
|
|
|
|
|
|
|
static void intel_check_pebs_isolation(void)
|
|
|
|
{
|
|
|
|
x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init void intel_pebs_isolation_quirk(void)
|
|
|
|
{
|
|
|
|
WARN_ON_ONCE(x86_pmu.check_microcode);
|
|
|
|
x86_pmu.check_microcode = intel_check_pebs_isolation;
|
|
|
|
intel_check_pebs_isolation();
|
|
|
|
}
|
|
|
|
|
2019-02-05 06:23:31 +08:00
|
|
|
static const struct x86_cpu_desc pebs_ucodes[] = {
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE, 7, 0x00000028),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 6, 0x00000618),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 7, 0x0000070c),
|
|
|
|
{}
|
|
|
|
};
|
2012-06-08 20:50:50 +08:00
|
|
|
|
2019-02-05 06:23:31 +08:00
|
|
|
static bool intel_snb_pebs_broken(void)
|
|
|
|
{
|
|
|
|
return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
|
2012-06-08 20:50:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_snb_check_microcode(void)
|
|
|
|
{
|
2019-02-05 06:23:31 +08:00
|
|
|
if (intel_snb_pebs_broken() == x86_pmu.pebs_broken)
|
2012-06-08 20:50:50 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Serialized by the microcode lock..
|
|
|
|
*/
|
|
|
|
if (x86_pmu.pebs_broken) {
|
|
|
|
pr_info("PEBS enabled due to microcode update\n");
|
|
|
|
x86_pmu.pebs_broken = 0;
|
|
|
|
} else {
|
|
|
|
pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
|
|
|
|
x86_pmu.pebs_broken = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-22 02:31:11 +08:00
|
|
|
static bool is_lbr_from(unsigned long msr)
|
|
|
|
{
|
|
|
|
unsigned long lbr_from_nr = x86_pmu.lbr_from + x86_pmu.lbr_nr;
|
|
|
|
|
|
|
|
return x86_pmu.lbr_from <= msr && msr < lbr_from_nr;
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Under certain circumstances, access certain MSR may cause #GP.
|
|
|
|
* The function tests if the input MSR can be safely accessed.
|
|
|
|
*/
|
|
|
|
static bool check_msr(unsigned long msr, u64 mask)
|
|
|
|
{
|
|
|
|
u64 val_old, val_new, val_tmp;
|
|
|
|
|
2019-06-16 22:13:13 +08:00
|
|
|
/*
|
|
|
|
* Disable the check for real HW, so we don't
|
|
|
|
* mess with potentially enabled registers:
|
|
|
|
*/
|
2019-07-25 10:39:26 +08:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
|
2019-06-16 22:13:13 +08:00
|
|
|
return true;
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Read the current value, change it and read it back to see if it
|
|
|
|
* matches, this is needed to detect certain hardware emulators
|
|
|
|
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
|
|
|
|
*/
|
|
|
|
if (rdmsrl_safe(msr, &val_old))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only change the bits which can be updated by wrmsrl.
|
|
|
|
*/
|
|
|
|
val_tmp = val_old ^ mask;
|
2016-06-22 02:31:11 +08:00
|
|
|
|
|
|
|
if (is_lbr_from(msr))
|
|
|
|
val_tmp = lbr_from_signext_quirk_wr(val_tmp);
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
if (wrmsrl_safe(msr, val_tmp) ||
|
|
|
|
rdmsrl_safe(msr, &val_new))
|
|
|
|
return false;
|
|
|
|
|
2016-06-22 02:31:11 +08:00
|
|
|
/*
|
|
|
|
* Quirk only affects validation in wrmsr(), so wrmsrl()'s value
|
|
|
|
* should equal rdmsrl()'s even with the quirk.
|
|
|
|
*/
|
2014-07-15 03:25:56 +08:00
|
|
|
if (val_new != val_tmp)
|
|
|
|
return false;
|
|
|
|
|
2016-06-22 02:31:11 +08:00
|
|
|
if (is_lbr_from(msr))
|
|
|
|
val_old = lbr_from_signext_quirk_wr(val_old);
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/* Here it's sure that the MSR can be safely accessed.
|
|
|
|
* Restore the old value and return.
|
|
|
|
*/
|
|
|
|
wrmsrl(msr, val_old);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_sandybridge_quirk(void)
|
2011-11-15 17:51:15 +08:00
|
|
|
{
|
2012-06-08 20:50:50 +08:00
|
|
|
x86_pmu.check_microcode = intel_snb_check_microcode;
|
2017-05-24 16:15:30 +08:00
|
|
|
cpus_read_lock();
|
2012-06-08 20:50:50 +08:00
|
|
|
intel_snb_check_microcode();
|
2017-05-24 16:15:30 +08:00
|
|
|
cpus_read_unlock();
|
2011-11-15 17:51:15 +08:00
|
|
|
}
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
|
|
|
|
{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
|
|
|
|
{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
|
|
|
|
{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
|
|
|
|
{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
|
|
|
|
{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
|
|
|
|
{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
|
|
|
|
{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
|
2011-11-10 20:57:26 +08:00
|
|
|
};
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_arch_events_quirk(void)
|
|
|
|
{
|
|
|
|
int bit;
|
|
|
|
|
|
|
|
/* disable events that are reported as not present by cpuid */
|
|
|
|
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
|
|
|
|
intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("CPUID marked event: \'%s\' unavailable\n",
|
|
|
|
intel_arch_events_map[bit].name);
|
2011-12-06 21:07:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init void intel_nehalem_quirk(void)
|
|
|
|
{
|
|
|
|
union cpuid10_ebx ebx;
|
|
|
|
|
|
|
|
ebx.full = x86_pmu.events_maskl;
|
|
|
|
if (ebx.split.no_branch_misses_retired) {
|
|
|
|
/*
|
|
|
|
* Erratum AAJ80 detected, we work it around by using
|
|
|
|
* the BR_MISP_EXEC.ANY event. This will over-count
|
|
|
|
* branch-misses, but it's still much better than the
|
|
|
|
* architectural event which is often completely bogus:
|
|
|
|
*/
|
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
|
|
|
|
ebx.split.no_branch_misses_retired = 0;
|
|
|
|
x86_pmu.events_maskl = ebx.full;
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("CPU erratum AAJ80 worked around\n");
|
2011-12-06 21:07:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-18 03:06:59 +08:00
|
|
|
/*
|
|
|
|
* enable software workaround for errata:
|
|
|
|
* SNB: BJ122
|
|
|
|
* IVB: BV98
|
|
|
|
* HSW: HSD29
|
|
|
|
*
|
|
|
|
* Only needed when HT is enabled. However detecting
|
2014-11-18 03:07:04 +08:00
|
|
|
* if HT is enabled is difficult (model specific). So instead,
|
|
|
|
* we enable the workaround in the early boot, and verify if
|
|
|
|
* it is needed in a later initcall phase once we have valid
|
|
|
|
* topology information to check if HT is actually enabled
|
2014-11-18 03:06:59 +08:00
|
|
|
*/
|
|
|
|
static __init void intel_ht_bug(void)
|
|
|
|
{
|
2014-11-18 03:07:04 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
x86_pmu.start_scheduling = intel_start_scheduling;
|
2015-05-21 16:57:32 +08:00
|
|
|
x86_pmu.commit_scheduling = intel_commit_scheduling;
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_pmu.stop_scheduling = intel_stop_scheduling;
|
|
|
|
}
|
|
|
|
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
|
|
|
|
/* "mem-stores" event alias; terminated with ';' like every other EVENT_ATTR_STR() use here. */
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82");
|
2013-06-18 08:36:52 +08:00
|
|
|
|
2013-09-06 11:37:40 +08:00
|
|
|
/* Haswell special events */
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
|
|
|
|
EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
|
2013-09-06 11:37:40 +08:00
|
|
|
|
2013-06-18 08:36:52 +08:00
|
|
|
static struct attribute *hsw_events_attrs[] = {
|
2017-11-09 08:07:18 +08:00
|
|
|
EVENT_PTR(td_slots_issued),
|
|
|
|
EVENT_PTR(td_slots_retired),
|
|
|
|
EVENT_PTR(td_fetch_bubbles),
|
|
|
|
EVENT_PTR(td_total_slots),
|
|
|
|
EVENT_PTR(td_total_slots_scale),
|
|
|
|
EVENT_PTR(td_recovery_bubbles),
|
|
|
|
EVENT_PTR(td_recovery_bubbles_scale),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
static struct attribute *hsw_mem_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_hsw),
|
|
|
|
EVENT_PTR(mem_st_hsw),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2017-11-09 08:07:18 +08:00
|
|
|
static struct attribute *hsw_tsx_events_attrs[] = {
|
2013-09-06 11:37:40 +08:00
|
|
|
EVENT_PTR(tx_start),
|
|
|
|
EVENT_PTR(tx_commit),
|
|
|
|
EVENT_PTR(tx_abort),
|
|
|
|
EVENT_PTR(tx_capacity),
|
|
|
|
EVENT_PTR(tx_conflict),
|
|
|
|
EVENT_PTR(el_start),
|
|
|
|
EVENT_PTR(el_commit),
|
|
|
|
EVENT_PTR(el_abort),
|
|
|
|
EVENT_PTR(el_capacity),
|
|
|
|
EVENT_PTR(el_conflict),
|
|
|
|
EVENT_PTR(cycles_t),
|
|
|
|
EVENT_PTR(cycles_ct),
|
2013-06-18 08:36:52 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
EVENT_ATTR_STR(tx-capacity-read, tx_capacity_read, "event=0x54,umask=0x80");
|
|
|
|
EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-capacity-read, el_capacity_read, "event=0x54,umask=0x80");
|
|
|
|
EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2");
|
|
|
|
|
|
|
|
static struct attribute *icl_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_hsw),
|
|
|
|
EVENT_PTR(mem_st_hsw),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2020-07-24 01:11:13 +08:00
|
|
|
static struct attribute *icl_td_events_attrs[] = {
|
|
|
|
EVENT_PTR(slots),
|
|
|
|
EVENT_PTR(td_retiring),
|
|
|
|
EVENT_PTR(td_bad_spec),
|
|
|
|
EVENT_PTR(td_fe_bound),
|
|
|
|
EVENT_PTR(td_be_bound),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct attribute *icl_tsx_events_attrs[] = {
|
|
|
|
EVENT_PTR(tx_start),
|
|
|
|
EVENT_PTR(tx_abort),
|
|
|
|
EVENT_PTR(tx_commit),
|
|
|
|
EVENT_PTR(tx_capacity_read),
|
|
|
|
EVENT_PTR(tx_capacity_write),
|
|
|
|
EVENT_PTR(tx_conflict),
|
|
|
|
EVENT_PTR(el_start),
|
|
|
|
EVENT_PTR(el_abort),
|
|
|
|
EVENT_PTR(el_commit),
|
|
|
|
EVENT_PTR(el_capacity_read),
|
|
|
|
EVENT_PTR(el_capacity_write),
|
|
|
|
EVENT_PTR(el_conflict),
|
|
|
|
EVENT_PTR(cycles_t),
|
|
|
|
EVENT_PTR(cycles_ct),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return an unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accessing the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latency, instruction latency (bit 15:0) and cache access
latency (bit 47:32) are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
|
|
|
|
EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
|
|
|
|
|
|
|
|
static struct attribute *spr_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_hsw),
|
|
|
|
EVENT_PTR(mem_st_spr),
|
|
|
|
EVENT_PTR(mem_ld_aux),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *spr_td_events_attrs[] = {
|
|
|
|
EVENT_PTR(slots),
|
|
|
|
EVENT_PTR(td_retiring),
|
|
|
|
EVENT_PTR(td_bad_spec),
|
|
|
|
EVENT_PTR(td_fe_bound),
|
|
|
|
EVENT_PTR(td_be_bound),
|
|
|
|
EVENT_PTR(td_heavy_ops),
|
|
|
|
EVENT_PTR(td_br_mispredict),
|
|
|
|
EVENT_PTR(td_fetch_lat),
|
|
|
|
EVENT_PTR(td_mem_bound),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *spr_tsx_events_attrs[] = {
|
|
|
|
EVENT_PTR(tx_start),
|
|
|
|
EVENT_PTR(tx_abort),
|
|
|
|
EVENT_PTR(tx_commit),
|
|
|
|
EVENT_PTR(tx_capacity_read),
|
|
|
|
EVENT_PTR(tx_capacity_write),
|
|
|
|
EVENT_PTR(tx_conflict),
|
|
|
|
EVENT_PTR(cycles_t),
|
|
|
|
EVENT_PTR(cycles_ct),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static ssize_t freeze_on_smi_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEFINE_MUTEX(freeze_on_smi_mutex);
|
|
|
|
|
|
|
|
static ssize_t freeze_on_smi_store(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
unsigned long val;
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
ret = kstrtoul(buf, 0, &val);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (val > 1)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
mutex_lock(&freeze_on_smi_mutex);
|
|
|
|
|
|
|
|
if (x86_pmu.attr_freeze_on_smi == val)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
x86_pmu.attr_freeze_on_smi = val;
|
|
|
|
|
|
|
|
get_online_cpus();
|
|
|
|
on_each_cpu(flip_smm_bit, &val, 1);
|
|
|
|
put_online_cpus();
|
|
|
|
done:
|
|
|
|
mutex_unlock(&freeze_on_smi_mutex);
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Force resched when TFA sysctl is modified
This patch provides guarantee to the sysadmin that when TFA is disabled, no PMU
event is using PMC3 when the echo command returns. Vice-Versa, when TFA
is enabled, PMU can use PMC3 immediately (to eliminate possible multiplexing).
$ perf stat -a -I 1000 --no-merge -e branches,branches,branches,branches
1.000123979 125,768,725,208 branches
1.000562520 125,631,000,456 branches
1.000942898 125,487,114,291 branches
1.001333316 125,323,363,620 branches
2.004721306 125,514,968,546 branches
2.005114560 125,511,110,861 branches
2.005482722 125,510,132,724 branches
2.005851245 125,508,967,086 branches
3.006323475 125,166,570,648 branches
3.006709247 125,165,650,056 branches
3.007086605 125,164,639,142 branches
3.007459298 125,164,402,912 branches
4.007922698 125,045,577,140 branches
4.008310775 125,046,804,324 branches
4.008670814 125,048,265,111 branches
4.009039251 125,048,677,611 branches
5.009503373 125,122,240,217 branches
5.009897067 125,122,450,517 branches
Then on another connection, sysadmin does:
$ echo 1 >/sys/devices/cpu/allow_tsx_force_abort
Then perf stat adjusts the events immediately:
5.010286029 125,121,393,483 branches
5.010646308 125,120,556,786 branches
6.011113588 124,963,351,832 branches
6.011510331 124,964,267,566 branches
6.011889913 124,964,829,130 branches
6.012262996 124,965,841,156 branches
7.012708299 124,419,832,234 branches [79.69%]
7.012847908 124,416,363,853 branches [79.73%]
7.013225462 124,400,723,712 branches [79.73%]
7.013598191 124,376,154,434 branches [79.70%]
8.014089834 124,250,862,693 branches [74.98%]
8.014481363 124,267,539,139 branches [74.94%]
8.014856006 124,259,519,786 branches [74.98%]
8.014980848 124,225,457,969 branches [75.04%]
9.015464576 124,204,235,423 branches [75.03%]
9.015858587 124,204,988,490 branches [75.04%]
9.016243680 124,220,092,486 branches [74.99%]
9.016620104 124,231,260,146 branches [74.94%]
And vice-versa if the sysadmin does:
$ echo 0 >/sys/devices/cpu/allow_tsx_force_abort
Events are again spread over the 4 counters:
10.017096277 124,276,230,565 branches [74.96%]
10.017237209 124,228,062,171 branches [75.03%]
10.017478637 124,178,780,626 branches [75.03%]
10.017853402 124,198,316,177 branches [75.03%]
11.018334423 124,602,418,933 branches [85.40%]
11.018722584 124,602,921,320 branches [85.42%]
11.019095621 124,603,956,093 branches [85.42%]
11.019467742 124,595,273,783 branches [85.42%]
12.019945736 125,110,114,864 branches
12.020330764 125,109,334,472 branches
12.020688740 125,109,818,865 branches
12.021054020 125,108,594,014 branches
13.021516774 125,109,164,018 branches
13.021903640 125,108,794,510 branches
13.022270770 125,107,756,978 branches
13.022630819 125,109,380,471 branches
14.023114989 125,133,140,817 branches
14.023501880 125,133,785,858 branches
14.023868339 125,133,852,700 branches
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: kan.liang@intel.com
Cc: nelson.dsouza@intel.com
Cc: tonyj@suse.com
Link: https://lkml.kernel.org/r/20190408173252.37932-3-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-09 01:32:52 +08:00
|
|
|
static void update_tfa_sched(void *ignored)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check if PMC3 is used
|
|
|
|
* and if so force schedule out for all event types all contexts
|
|
|
|
*/
|
|
|
|
if (test_bit(3, cpuc->active_mask))
|
2021-04-12 22:30:43 +08:00
|
|
|
perf_pmu_resched(x86_get_pmu(smp_processor_id()));
|
perf/x86/intel: Force resched when TFA sysctl is modified
This patch provides guarantee to the sysadmin that when TFA is disabled, no PMU
event is using PMC3 when the echo command returns. Vice-Versa, when TFA
is enabled, PMU can use PMC3 immediately (to eliminate possible multiplexing).
$ perf stat -a -I 1000 --no-merge -e branches,branches,branches,branches
1.000123979 125,768,725,208 branches
1.000562520 125,631,000,456 branches
1.000942898 125,487,114,291 branches
1.001333316 125,323,363,620 branches
2.004721306 125,514,968,546 branches
2.005114560 125,511,110,861 branches
2.005482722 125,510,132,724 branches
2.005851245 125,508,967,086 branches
3.006323475 125,166,570,648 branches
3.006709247 125,165,650,056 branches
3.007086605 125,164,639,142 branches
3.007459298 125,164,402,912 branches
4.007922698 125,045,577,140 branches
4.008310775 125,046,804,324 branches
4.008670814 125,048,265,111 branches
4.009039251 125,048,677,611 branches
5.009503373 125,122,240,217 branches
5.009897067 125,122,450,517 branches
Then on another connection, sysadmin does:
$ echo 1 >/sys/devices/cpu/allow_tsx_force_abort
Then perf stat adjusts the events immediately:
5.010286029 125,121,393,483 branches
5.010646308 125,120,556,786 branches
6.011113588 124,963,351,832 branches
6.011510331 124,964,267,566 branches
6.011889913 124,964,829,130 branches
6.012262996 124,965,841,156 branches
7.012708299 124,419,832,234 branches [79.69%]
7.012847908 124,416,363,853 branches [79.73%]
7.013225462 124,400,723,712 branches [79.73%]
7.013598191 124,376,154,434 branches [79.70%]
8.014089834 124,250,862,693 branches [74.98%]
8.014481363 124,267,539,139 branches [74.94%]
8.014856006 124,259,519,786 branches [74.98%]
8.014980848 124,225,457,969 branches [75.04%]
9.015464576 124,204,235,423 branches [75.03%]
9.015858587 124,204,988,490 branches [75.04%]
9.016243680 124,220,092,486 branches [74.99%]
9.016620104 124,231,260,146 branches [74.94%]
And vice-versa if the syadmin does:
$ echo 0 >/sys/devices/cpu/allow_tsx_force_abort
Events are again spread over the 4 counters:
10.017096277 124,276,230,565 branches [74.96%]
10.017237209 124,228,062,171 branches [75.03%]
10.017478637 124,178,780,626 branches [75.03%]
10.017853402 124,198,316,177 branches [75.03%]
11.018334423 124,602,418,933 branches [85.40%]
11.018722584 124,602,921,320 branches [85.42%]
11.019095621 124,603,956,093 branches [85.42%]
11.019467742 124,595,273,783 branches [85.42%]
12.019945736 125,110,114,864 branches
12.020330764 125,109,334,472 branches
12.020688740 125,109,818,865 branches
12.021054020 125,108,594,014 branches
13.021516774 125,109,164,018 branches
13.021903640 125,108,794,510 branches
13.022270770 125,107,756,978 branches
13.022630819 125,109,380,471 branches
14.023114989 125,133,140,817 branches
14.023501880 125,133,785,858 branches
14.023868339 125,133,852,700 branches
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: kan.liang@intel.com
Cc: nelson.dsouza@intel.com
Cc: tonyj@suse.com
Link: https://lkml.kernel.org/r/20190408173252.37932-3-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-09 01:32:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t show_sysctl_tfa(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, 40, "%d\n", allow_tsx_force_abort);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * sysfs store callback for "allow_tsx_force_abort".
 *
 * Parses @buf as a boolean. When the value differs from the current
 * setting, allow_tsx_force_abort is updated and update_tfa_sched() is
 * run on each CPU so that the change takes effect before returning.
 *
 * Returns @count on success (including the no-change case) or the
 * kstrtobool() error on a parse failure.
 */
static ssize_t set_sysctl_tfa(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	bool allow;
	ssize_t err;

	err = kstrtobool(buf, &allow);
	if (err)
		return err;

	/* Skip the cross-CPU work when nothing changes. */
	if (allow != allow_tsx_force_abort) {
		allow_tsx_force_abort = allow;

		get_online_cpus();
		on_each_cpu(update_tfa_sched, NULL, 1);
		put_online_cpus();
	}

	return count;
}
|
|
|
|
|
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
/* Read/write device attribute "freeze_on_smi"; listed in intel_pmu_attrs[]. */
static DEVICE_ATTR_RW(freeze_on_smi);
|
|
|
|
|
2017-08-23 02:52:01 +08:00
|
|
|
static ssize_t branches_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR_RO(branches);
|
|
|
|
|
|
|
|
static struct attribute *lbr_attrs[] = {
|
|
|
|
&dev_attr_branches.attr,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
static char pmu_name_str[30];
|
|
|
|
|
|
|
|
static ssize_t pmu_name_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR_RO(pmu_name);
|
|
|
|
|
|
|
|
static struct attribute *intel_pmu_caps_attrs[] = {
|
2017-08-28 18:46:50 +08:00
|
|
|
&dev_attr_pmu_name.attr,
|
|
|
|
NULL
|
2017-08-23 02:52:01 +08:00
|
|
|
};
|
|
|
|
|
perf/x86/intel: Force resched when TFA sysctl is modified
This patch provides guarantee to the sysadmin that when TFA is disabled, no PMU
event is using PMC3 when the echo command returns. Vice-Versa, when TFA
is enabled, PMU can use PMC3 immediately (to eliminate possible multiplexing).
$ perf stat -a -I 1000 --no-merge -e branches,branches,branches,branches
1.000123979 125,768,725,208 branches
1.000562520 125,631,000,456 branches
1.000942898 125,487,114,291 branches
1.001333316 125,323,363,620 branches
2.004721306 125,514,968,546 branches
2.005114560 125,511,110,861 branches
2.005482722 125,510,132,724 branches
2.005851245 125,508,967,086 branches
3.006323475 125,166,570,648 branches
3.006709247 125,165,650,056 branches
3.007086605 125,164,639,142 branches
3.007459298 125,164,402,912 branches
4.007922698 125,045,577,140 branches
4.008310775 125,046,804,324 branches
4.008670814 125,048,265,111 branches
4.009039251 125,048,677,611 branches
5.009503373 125,122,240,217 branches
5.009897067 125,122,450,517 branches
Then on another connection, sysadmin does:
$ echo 1 >/sys/devices/cpu/allow_tsx_force_abort
Then perf stat adjusts the events immediately:
5.010286029 125,121,393,483 branches
5.010646308 125,120,556,786 branches
6.011113588 124,963,351,832 branches
6.011510331 124,964,267,566 branches
6.011889913 124,964,829,130 branches
6.012262996 124,965,841,156 branches
7.012708299 124,419,832,234 branches [79.69%]
7.012847908 124,416,363,853 branches [79.73%]
7.013225462 124,400,723,712 branches [79.73%]
7.013598191 124,376,154,434 branches [79.70%]
8.014089834 124,250,862,693 branches [74.98%]
8.014481363 124,267,539,139 branches [74.94%]
8.014856006 124,259,519,786 branches [74.98%]
8.014980848 124,225,457,969 branches [75.04%]
9.015464576 124,204,235,423 branches [75.03%]
9.015858587 124,204,988,490 branches [75.04%]
9.016243680 124,220,092,486 branches [74.99%]
9.016620104 124,231,260,146 branches [74.94%]
And vice-versa if the sysadmin does:
$ echo 0 >/sys/devices/cpu/allow_tsx_force_abort
Events are again spread over the 4 counters:
10.017096277 124,276,230,565 branches [74.96%]
10.017237209 124,228,062,171 branches [75.03%]
10.017478637 124,178,780,626 branches [75.03%]
10.017853402 124,198,316,177 branches [75.03%]
11.018334423 124,602,418,933 branches [85.40%]
11.018722584 124,602,921,320 branches [85.42%]
11.019095621 124,603,956,093 branches [85.42%]
11.019467742 124,595,273,783 branches [85.42%]
12.019945736 125,110,114,864 branches
12.020330764 125,109,334,472 branches
12.020688740 125,109,818,865 branches
12.021054020 125,108,594,014 branches
13.021516774 125,109,164,018 branches
13.021903640 125,108,794,510 branches
13.022270770 125,107,756,978 branches
13.022630819 125,109,380,471 branches
14.023114989 125,133,140,817 branches
14.023501880 125,133,785,858 branches
14.023868339 125,133,852,700 branches
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: kan.liang@intel.com
Cc: nelson.dsouza@intel.com
Cc: tonyj@suse.com
Link: https://lkml.kernel.org/r/20190408173252.37932-3-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-09 01:32:52 +08:00
|
|
|
/*
 * Device attribute "allow_tsx_force_abort" (mode 0644), read through
 * show_sysctl_tfa() and written through set_sysctl_tfa().
 */
static DEVICE_ATTR(allow_tsx_force_abort, 0644,
		   show_sysctl_tfa, set_sysctl_tfa);
|
2019-03-06 05:23:18 +08:00
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static struct attribute *intel_pmu_attrs[] = {
|
|
|
|
&dev_attr_freeze_on_smi.attr,
|
2019-05-24 21:21:52 +08:00
|
|
|
&dev_attr_allow_tsx_force_abort.attr,
|
2017-05-12 22:51:13 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static umode_t
|
|
|
|
tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
2018-09-06 21:57:48 +08:00
|
|
|
{
|
2019-05-12 23:55:13 +08:00
|
|
|
return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0;
|
|
|
|
}
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static umode_t
|
|
|
|
pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
return x86_pmu.pebs ? attr->mode : 0;
|
|
|
|
}
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2019-05-12 23:55:15 +08:00
|
|
|
static umode_t
|
|
|
|
lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
return x86_pmu.lbr_nr ? attr->mode : 0;
|
|
|
|
}
|
|
|
|
|
2019-05-12 23:55:16 +08:00
|
|
|
static umode_t
|
|
|
|
exra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
return x86_pmu.version >= 2 ? attr->mode : 0;
|
|
|
|
}
|
|
|
|
|
2019-05-24 21:21:52 +08:00
|
|
|
static umode_t
|
|
|
|
default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
if (attr == &dev_attr_allow_tsx_force_abort.attr)
|
|
|
|
return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
|
|
|
|
|
|
|
|
return attr->mode;
|
|
|
|
}
|
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static struct attribute_group group_events_td = {
|
|
|
|
.name = "events",
|
|
|
|
};
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static struct attribute_group group_events_mem = {
|
|
|
|
.name = "events",
|
|
|
|
.is_visible = pebs_is_visible,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute_group group_events_tsx = {
|
|
|
|
.name = "events",
|
|
|
|
.is_visible = tsx_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:15 +08:00
|
|
|
static struct attribute_group group_caps_gen = {
|
|
|
|
.name = "caps",
|
|
|
|
.attrs = intel_pmu_caps_attrs,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute_group group_caps_lbr = {
|
|
|
|
.name = "caps",
|
|
|
|
.attrs = lbr_attrs,
|
|
|
|
.is_visible = lbr_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:16 +08:00
|
|
|
static struct attribute_group group_format_extra = {
|
|
|
|
.name = "format",
|
|
|
|
.is_visible = exra_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:17 +08:00
|
|
|
static struct attribute_group group_format_extra_skl = {
|
|
|
|
.name = "format",
|
|
|
|
.is_visible = exra_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:18 +08:00
|
|
|
static struct attribute_group group_default = {
|
2019-05-24 21:21:52 +08:00
|
|
|
.attrs = intel_pmu_attrs,
|
|
|
|
.is_visible = default_is_visible,
|
2019-05-12 23:55:18 +08:00
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static const struct attribute_group *attr_update[] = {
|
|
|
|
&group_events_td,
|
|
|
|
&group_events_mem,
|
|
|
|
&group_events_tsx,
|
2019-05-12 23:55:15 +08:00
|
|
|
&group_caps_gen,
|
|
|
|
&group_caps_lbr,
|
2019-05-12 23:55:16 +08:00
|
|
|
&group_format_extra,
|
2019-05-12 23:55:17 +08:00
|
|
|
&group_format_extra_skl,
|
2019-05-12 23:55:18 +08:00
|
|
|
&group_default,
|
2019-05-12 23:55:13 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * A single NULL attribute pointer (zero-initialized static). Its address
 * is the initial value of the attribute-list pointers declared at the
 * top of intel_pmu_init().
 */
static struct attribute *empty_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2021-04-12 22:30:51 +08:00
|
|
|
/*
 * Clip the counter counts to the supported maxima and build the control
 * mask.
 *
 * @num_counters:       in/out, clipped to INTEL_PMC_MAX_GENERIC
 * @num_counters_fixed: in/out, clipped to INTEL_PMC_MAX_FIXED
 * @intel_ctrl:         out, one bit per generic counter in the low bits,
 *                      ORed with @fixed_mask shifted to INTEL_PMC_IDX_FIXED
 * @fixed_mask:         bitmask of fixed counters
 *
 * A WARN is raised whenever a count has to be clipped.
 *
 * NOTE(review): clipping *num_counters_fixed does not trim @fixed_mask,
 * so any bits above INTEL_PMC_MAX_FIXED still end up in *intel_ctrl.
 * Confirm that callers pass a consistent mask.
 */
static void intel_pmu_check_num_counters(int *num_counters,
					 int *num_counters_fixed,
					 u64 *intel_ctrl, u64 fixed_mask)
{
	u64 ctrl;

	if (*num_counters > INTEL_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     *num_counters, INTEL_PMC_MAX_GENERIC);
		*num_counters = INTEL_PMC_MAX_GENERIC;
	}

	/* Generic counters occupy the low bits of the control mask. */
	ctrl = (1ULL << *num_counters) - 1;

	if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     *num_counters_fixed, INTEL_PMC_MAX_FIXED);
		*num_counters_fixed = INTEL_PMC_MAX_FIXED;
	}

	*intel_ctrl = ctrl | (fixed_mask << INTEL_PMC_IDX_FIXED);
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
__init int intel_pmu_init(void)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2019-05-12 23:55:17 +08:00
|
|
|
struct attribute **extra_skl_attr = &empty_attrs;
|
2019-05-12 23:55:13 +08:00
|
|
|
struct attribute **extra_attr = &empty_attrs;
|
|
|
|
struct attribute **td_attr = &empty_attrs;
|
|
|
|
struct attribute **mem_attr = &empty_attrs;
|
|
|
|
struct attribute **tsx_attr = &empty_attrs;
|
2010-02-26 19:05:05 +08:00
|
|
|
union cpuid10_edx edx;
|
|
|
|
union cpuid10_eax eax;
|
2011-11-10 20:57:26 +08:00
|
|
|
union cpuid10_ebx ebx;
|
2012-06-21 02:46:34 +08:00
|
|
|
struct event_constraint *c;
|
2021-01-29 06:40:11 +08:00
|
|
|
unsigned int fixed_mask;
|
2014-07-15 03:25:56 +08:00
|
|
|
struct extra_reg *er;
|
2019-06-03 21:41:21 +08:00
|
|
|
bool pmem = false;
|
2014-07-15 03:25:56 +08:00
|
|
|
int version, i;
|
2017-08-23 02:52:01 +08:00
|
|
|
char *name;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
|
2010-03-12 00:54:39 +08:00
|
|
|
switch (boot_cpu_data.x86) {
|
|
|
|
case 0x6:
|
|
|
|
return p6_pmu_init();
|
2012-09-27 02:12:52 +08:00
|
|
|
case 0xb:
|
|
|
|
return knc_pmu_init();
|
2010-03-12 00:54:39 +08:00
|
|
|
case 0xf:
|
|
|
|
return p4_pmu_init();
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether the Architectural PerfMon supports
|
|
|
|
* Branch Misses Retired hw_event or not.
|
|
|
|
*/
|
2021-01-29 06:40:11 +08:00
|
|
|
cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
|
2011-11-10 20:57:26 +08:00
|
|
|
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
|
2010-02-26 19:05:05 +08:00
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
version = eax.split.version_id;
|
|
|
|
if (version < 2)
|
|
|
|
x86_pmu = core_pmu;
|
|
|
|
else
|
|
|
|
x86_pmu = intel_pmu;
|
|
|
|
|
|
|
|
x86_pmu.version = version;
|
2010-03-30 00:36:50 +08:00
|
|
|
x86_pmu.num_counters = eax.split.num_counters;
|
|
|
|
x86_pmu.cntval_bits = eax.split.bit_width;
|
|
|
|
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_pmu.events_maskl = ebx.full;
|
|
|
|
x86_pmu.events_mask_len = eax.split.mask_length;
|
|
|
|
|
2012-06-06 08:56:48 +08:00
|
|
|
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Quirk: v2 perfmon does not report fixed-purpose events, so
|
2016-10-21 16:18:59 +08:00
|
|
|
* assume at least 3 events, when not running in a hypervisor:
|
2010-02-26 19:05:05 +08:00
|
|
|
*/
|
2021-01-29 06:40:11 +08:00
|
|
|
if (version > 1 && version < 5) {
|
2016-10-21 16:18:59 +08:00
|
|
|
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
|
|
|
|
|
|
|
|
x86_pmu.num_counters_fixed =
|
|
|
|
max((int)edx.split.num_counters_fixed, assume);
|
2021-01-29 06:40:11 +08:00
|
|
|
|
|
|
|
fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
|
|
|
|
} else if (version >= 5)
|
|
|
|
x86_pmu.num_counters_fixed = fls(fixed_mask);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2014-02-03 21:29:03 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_PDCM)) {
|
2010-03-04 00:07:40 +08:00
|
|
|
u64 capabilities;
|
|
|
|
|
|
|
|
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
|
|
|
|
x86_pmu.intel_cap.capabilities = capabilities;
|
|
|
|
}
|
|
|
|
|
2020-07-03 20:49:09 +08:00
|
|
|
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) {
|
2020-07-03 20:49:08 +08:00
|
|
|
x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
|
2020-07-03 20:49:09 +08:00
|
|
|
x86_pmu.lbr_read = intel_pmu_lbr_read_32;
|
|
|
|
}
|
2020-07-03 20:49:08 +08:00
|
|
|
|
perf/x86/intel/lbr: Support Architectural LBR
Last Branch Records (LBR) enables recording of software path history by
logging taken branches and other control flows within architectural
registers now. Intel CPUs have had model-specific LBR for quite some
time, but this evolves them into an architectural feature now.
The main improvements of Architectural LBR implemented includes:
- Linux kernel can support the LBR features without knowing the model
number of the current CPU.
- Architectural LBR capabilities can be enumerated by CPUID. The
lbr_ctl_map is based on the CPUID Enumeration.
- The possible LBR depth can be retrieved from CPUID enumeration. The
max value is written to the new MSR_ARCH_LBR_DEPTH as the number of
LBR entries.
- A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs,
which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR.
- Each LBR record or entry is still comprised of three MSRs,
IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_INFO.
But they become the architectural MSRs.
- Architectural LBR is stack-like now. Entry 0 is always the youngest
branch, entry 1 the next youngest... The TOS MSR has been removed.
The way to enable/disable Architectural LBR is similar to the previous
model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but
some modifications are required, which include:
- MSR_ARCH_LBR_CTL is used to enable and configure the Architectural
LBR.
- When checking the value of the IA32_DEBUGCTL MSR, ignoring the
DEBUGCTLMSR_LBR (bit 0) for Architectural LBR, which has no meaning
and always return 0.
- The FREEZE_LBRS_ON_PMI has to be explicitly set/clear, because
MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for
Architectural LBR.
- Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for
Architectural LBR.
Some Architectural LBR dedicated functions are implemented to
reset/read/save/restore LBR.
- For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR
entries, which is a lot faster and can improve the context switch
latency.
- For read, the branch type information can be retrieved from
the MSR_ARCH_LBR_INFO_*. But it's not fully compatible due to
OTHER_BRANCH type. The software decoding is still required for the
OTHER_BRANCH case.
LBR records are stored in the age order as well. Reuse
intel_pmu_store_lbr(). Check the CPUID enumeration before accessing
the corresponding bits in LBR_INFO.
- For save/restore, applying the fast reset (writing ARCH_LBR_DEPTH).
Reading 'lbr_from' of entry 0 instead of the TOS MSR to check if the
LBR registers are reset in the deep C-state. If 'the deep C-state
reset' bit is not set in CPUID enumeration, ignoring the check.
XSAVE support for Architectural LBR will be implemented later.
The number of LBR entries cannot be hardcoded anymore, which should be
retrieved from CPUID enumeration. A new structure
x86_perf_task_context_arch_lbr is introduced for Architectural LBR.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-15-git-send-email-kan.liang@linux.intel.com
2020-07-03 20:49:20 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_ARCH_LBR))
|
|
|
|
intel_pmu_arch_lbr_init();
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
intel_ds_init();
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
|
|
|
|
|
2020-10-29 03:42:47 +08:00
|
|
|
if (version >= 5) {
|
|
|
|
x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated;
|
|
|
|
if (x86_pmu.intel_cap.anythread_deprecated)
|
|
|
|
pr_cont(" AnyThread deprecated, ");
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Install the hw-cache-events table:
|
|
|
|
*/
|
|
|
|
switch (boot_cpu_data.x86_model) {
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_CORE_YONAH:
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Core events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "core";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_CORE2_MEROM:
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_clovertown_quirk);
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2019-01-26 02:49:17 +08:00
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_CORE2_MEROM_L:
|
|
|
|
case INTEL_FAM6_CORE2_PENRYN:
|
|
|
|
case INTEL_FAM6_CORE2_DUNNINGTON:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_core();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_core2_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Core2 events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "core2";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_NEHALEM:
|
|
|
|
case INTEL_FAM6_NEHALEM_EP:
|
|
|
|
case INTEL_FAM6_NEHALEM_EX:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2011-03-03 10:34:48 +08:00
|
|
|
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_nhm();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_nehalem_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
|
2010-03-26 21:08:44 +08:00
|
|
|
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
|
2011-03-03 10:34:47 +08:00
|
|
|
x86_pmu.extra_regs = intel_nehalem_extra_regs;
|
2019-08-20 07:13:31 +08:00
|
|
|
x86_pmu.limit_period = nhm_limit_period;
|
2011-04-27 17:51:41 +08:00
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = nhm_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2011-04-29 20:17:19 +08:00
|
|
|
/* UOPS_ISSUED.STALLED_CYCLES */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-04-29 20:17:19 +08:00
|
|
|
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
|
2011-04-24 14:18:31 +08:00
|
|
|
|
2016-03-02 06:25:24 +08:00
|
|
|
intel_pmu_pebs_data_source_nhm();
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_nehalem_quirk);
|
2017-08-17 06:21:53 +08:00
|
|
|
x86_pmu.pebs_no_tlb = 1;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
2011-04-27 17:51:41 +08:00
|
|
|
|
2010-03-26 21:08:44 +08:00
|
|
|
pr_cont("Nehalem events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "nehalem";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
2010-03-03 19:02:30 +08:00
|
|
|
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_BONNELL:
|
|
|
|
case INTEL_FAM6_ATOM_BONNELL_MID:
|
|
|
|
case INTEL_FAM6_ATOM_SALTWELL:
|
|
|
|
case INTEL_FAM6_ATOM_SALTWELL_MID:
|
|
|
|
case INTEL_FAM6_ATOM_SALTWELL_TABLET:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_atom();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_gen_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
|
2015-12-04 04:03:10 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Atom events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "bonnell";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
|
|
|
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_SILVERMONT:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ATOM_SILVERMONT_D:
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_SILVERMONT_MID:
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_ATOM_AIRMONT:
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_AIRMONT_MID:
|
2013-07-18 17:02:24 +08:00
|
|
|
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
2016-04-15 15:53:45 +08:00
|
|
|
intel_pmu_lbr_init_slm();
|
2013-07-18 17:02:24 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_slm_extra_regs;
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = slm_events_attrs;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2013-07-18 17:02:24 +08:00
|
|
|
pr_cont("Silvermont events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "silvermont";
|
2013-07-18 17:02:24 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT_D:
|
2016-04-15 15:42:47 +08:00
|
|
|
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_glm_extra_regs;
|
|
|
|
/*
|
|
|
|
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
|
|
|
|
* for precise cycles.
|
|
|
|
* :pp is identical to :ppp
|
|
|
|
*/
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2016-04-28 23:35:46 +08:00
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
2016-04-15 15:42:47 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = glm_events_attrs;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2016-04-15 15:42:47 +08:00
|
|
|
pr_cont("Goldmont events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "goldmont";
|
2016-04-15 15:42:47 +08:00
|
|
|
break;
|
|
|
|
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
|
2017-07-12 21:44:23 +08:00
|
|
|
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_glm_extra_regs;
|
|
|
|
/*
|
|
|
|
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
|
|
|
|
* for precise cycles.
|
|
|
|
*/
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2018-03-09 10:15:42 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_PEBS_ALL;
|
2017-07-12 21:44:23 +08:00
|
|
|
x86_pmu.get_event_constraints = glp_get_event_constraints;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = glm_events_attrs;
|
2017-07-12 21:44:23 +08:00
|
|
|
/* Goldmont Plus has 4-wide pipeline */
|
|
|
|
event_attr_td_total_slots_scale_glm.event_str = "4";
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2017-07-12 21:44:23 +08:00
|
|
|
pr_cont("Goldmont plus events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "goldmont_plus";
|
2017-07-12 21:44:23 +08:00
|
|
|
break;
|
|
|
|
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ATOM_TREMONT_D:
|
2020-01-29 02:31:17 +08:00
|
|
|
case INTEL_FAM6_ATOM_TREMONT:
|
2020-09-28 20:30:41 +08:00
|
|
|
case INTEL_FAM6_ATOM_TREMONT_L:
|
2019-04-11 02:57:09 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_tnt_extra_regs;
|
|
|
|
/*
|
|
|
|
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
|
|
|
|
* for precise cycles.
|
|
|
|
*/
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.get_event_constraints = tnt_get_event_constraints;
|
2020-12-09 04:05:52 +08:00
|
|
|
td_attr = tnt_events_attrs;
|
2019-04-11 02:57:09 +08:00
|
|
|
extra_attr = slm_format_attr;
|
|
|
|
pr_cont("Tremont events, ");
|
|
|
|
name = "Tremont";
|
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_WESTMERE:
|
|
|
|
case INTEL_FAM6_WESTMERE_EP:
|
|
|
|
case INTEL_FAM6_WESTMERE_EX:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2011-03-03 10:34:48 +08:00
|
|
|
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_nhm();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_westmere_event_constraints;
|
2010-03-29 22:37:17 +08:00
|
|
|
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
|
2011-03-03 10:34:47 +08:00
|
|
|
x86_pmu.extra_regs = intel_westmere_extra_regs;
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2011-04-30 15:14:54 +08:00
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = nhm_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2011-04-30 15:14:54 +08:00
|
|
|
/* UOPS_ISSUED.STALLED_CYCLES */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-04-30 15:14:54 +08:00
|
|
|
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
|
2011-04-30 15:14:54 +08:00
|
|
|
|
2016-03-02 06:25:24 +08:00
|
|
|
intel_pmu_pebs_data_source_nhm();
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Westmere events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "westmere";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
2010-02-01 22:36:30 +08:00
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_SANDYBRIDGE:
|
|
|
|
case INTEL_FAM6_SANDYBRIDGE_X:
|
2012-06-05 16:26:43 +08:00
|
|
|
x86_add_quirk(intel_sandybridge_quirk);
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2011-03-02 21:27:04 +08:00
|
|
|
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2012-07-17 17:27:55 +08:00
|
|
|
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2011-03-02 21:27:04 +08:00
|
|
|
|
2012-02-10 06:20:55 +08:00
|
|
|
intel_pmu_lbr_init_snb();
|
2011-03-02 21:27:04 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_snb_event_constraints;
|
2011-08-31 07:41:05 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
|
2012-06-05 16:26:43 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
|
2016-06-03 08:19:29 +08:00
|
|
|
if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X)
|
2013-04-16 19:51:43 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
|
|
|
else
|
|
|
|
x86_pmu.extra_regs = intel_snb_extra_regs;
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
|
2011-06-06 22:57:12 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2011-05-06 15:14:02 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = snb_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = snb_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2011-05-06 15:14:02 +08:00
|
|
|
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-05-06 15:14:02 +08:00
|
|
|
/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
|
2011-05-06 15:14:02 +08:00
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
pr_cont("SandyBridge events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "sandybridge";
|
2011-03-02 21:27:04 +08:00
|
|
|
break;
|
2014-07-30 18:08:56 +08:00
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_IVYBRIDGE:
|
|
|
|
case INTEL_FAM6_IVYBRIDGE_X:
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2012-09-11 07:07:01 +08:00
|
|
|
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2014-07-15 03:33:25 +08:00
|
|
|
/* dTLB-load-misses on IVB is different than SNB */
|
|
|
|
hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
|
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_snb();
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
x86_pmu.event_constraints = intel_ivb_event_constraints;
|
2012-09-11 07:07:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compared to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the same number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2016-06-03 08:19:29 +08:00
|
|
|
if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X)
|
2013-04-16 19:51:43 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
|
|
|
else
|
|
|
|
x86_pmu.extra_regs = intel_snb_extra_regs;
|
2012-09-11 07:07:01 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2012-09-11 07:07:01 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = snb_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = snb_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
|
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
pr_cont("IvyBridge events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "ivybridge";
|
2012-09-11 07:07:01 +08:00
|
|
|
break;
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_HASWELL:
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_HASWELL_X:
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_HASWELL_L:
|
2019-08-28 03:48:23 +08:00
|
|
|
case INTEL_FAM6_HASWELL_G:
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2019-02-05 06:23:30 +08:00
|
|
|
x86_add_quirk(intel_pebs_isolation_quirk);
|
2013-06-18 08:36:50 +08:00
|
|
|
x86_pmu.late_ack = true;
|
2015-02-18 10:18:04 +08:00
|
|
|
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
2013-06-18 08:36:48 +08:00
|
|
|
|
perf/x86/intel: Add basic Haswell LBR call stack support
Haswell has a new feature that utilizes the existing LBR facility to
record call chains. To enable this feature, bits (JCC, NEAR_IND_JMP,
NEAR_REL_JMP, FAR_BRANCH, EN_CALLSTACK) in LBR_SELECT must be set to 1,
bits (NEAR_REL_CALL, NEAR_IND_CALL, NEAR_RET) must be cleared. Due to
a hardware bug of Haswell, this feature doesn't work well with
FREEZE_LBRS_ON_PMI.
When the call stack feature is enabled, the LBR stack will capture
unfiltered call data normally, but as return instructions are executed,
the last captured branch record is flushed from the on-chip registers
in a last-in first-out (LIFO) manner. Thus, branch information relative
to leaf functions will not be captured, while preserving the call stack
information of the main line execution path.
This patch defines a separate lbr_sel map for Haswell. The map contains
a new entry for the call stack feature.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-05 10:56:00 +08:00
|
|
|
intel_pmu_lbr_init_hsw();
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_hsw_event_constraints;
|
2013-06-18 08:36:49 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
|
2014-08-01 05:05:22 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compared to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the same number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2013-06-18 08:36:48 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
2013-09-20 22:40:44 +08:00
|
|
|
x86_pmu.lbr_double_abort = true;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = hsw_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = hsw_mem_events_attrs;
|
|
|
|
tsx_attr = hsw_tsx_events_attrs;
|
2013-06-18 08:36:48 +08:00
|
|
|
pr_cont("Haswell events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "haswell";
|
2013-06-18 08:36:48 +08:00
|
|
|
break;
|
|
|
|
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_BROADWELL:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_BROADWELL_D:
|
2019-08-28 03:48:23 +08:00
|
|
|
case INTEL_FAM6_BROADWELL_G:
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_BROADWELL_X:
|
2019-02-05 06:23:30 +08:00
|
|
|
x86_add_quirk(intel_pebs_isolation_quirk);
|
2015-02-18 10:18:05 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
|
|
|
|
hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
|
|
|
|
BDW_L3_MISS|HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
|
|
|
|
HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
|
|
|
|
BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
|
|
|
|
BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
|
|
|
|
|
2015-04-02 16:12:57 +08:00
|
|
|
intel_pmu_lbr_init_hsw();
|
2015-02-18 10:18:05 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_bdw_event_constraints;
|
2016-03-04 03:50:42 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
|
2015-02-18 10:18:05 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compared to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the same number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2015-02-18 10:18:05 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2015-02-18 10:18:05 +08:00
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
x86_pmu.limit_period = bdw_limit_period;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = hsw_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = hsw_mem_events_attrs;
|
|
|
|
tsx_attr = hsw_tsx_events_attrs;
|
2015-02-18 10:18:05 +08:00
|
|
|
pr_cont("Broadwell events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "broadwell";
|
2015-02-18 10:18:05 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_XEON_PHI_KNL:
|
2016-10-13 02:26:34 +08:00
|
|
|
case INTEL_FAM6_XEON_PHI_KNM:
|
2015-12-08 06:28:18 +08:00
|
|
|
memcpy(hw_cache_event_ids,
|
|
|
|
slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs,
|
|
|
|
knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
intel_pmu_lbr_init_knl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_knl_extra_regs;
|
|
|
|
|
|
|
|
/* all extra regs are per-cpu when HT is on */
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2016-10-13 02:26:34 +08:00
|
|
|
pr_cont("Knights Landing/Mill events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "knights-landing";
|
2015-12-08 06:28:18 +08:00
|
|
|
break;
|
|
|
|
|
2019-06-03 21:41:21 +08:00
|
|
|
case INTEL_FAM6_SKYLAKE_X:
|
|
|
|
pmem = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_SKYLAKE_L:
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_SKYLAKE:
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_KABYLAKE_L:
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_KABYLAKE:
|
2019-10-08 23:50:03 +08:00
|
|
|
case INTEL_FAM6_COMETLAKE_L:
|
|
|
|
case INTEL_FAM6_COMETLAKE:
|
2019-02-05 06:23:30 +08:00
|
|
|
x86_add_quirk(intel_pebs_isolation_quirk);
|
2015-05-11 03:22:44 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
2016-05-20 08:09:57 +08:00
|
|
|
/* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
|
|
|
|
event_attr_td_recovery_bubbles.event_str_noht =
|
|
|
|
"event=0xd,umask=0x1,cmask=1";
|
|
|
|
event_attr_td_recovery_bubbles.event_str_ht =
|
|
|
|
"event=0xd,umask=0x1,cmask=1,any=1";
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
x86_pmu.event_constraints = intel_skl_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_skl_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compared to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the same number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2015-05-11 03:22:44 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:17 +08:00
|
|
|
extra_skl_attr = skl_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = hsw_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = hsw_mem_events_attrs;
|
|
|
|
tsx_attr = hsw_tsx_events_attrs;
|
2019-06-03 21:41:21 +08:00
|
|
|
intel_pmu_pebs_data_source_skl(pmem);
|
2019-03-06 05:23:18 +08:00
|
|
|
|
|
|
|
if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
|
|
|
|
x86_pmu.flags |= PMU_FL_TFA;
|
|
|
|
x86_pmu.get_event_constraints = tfa_get_event_constraints;
|
|
|
|
x86_pmu.enable_all = intel_tfa_pmu_enable_all;
|
|
|
|
x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
|
|
|
|
}
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
pr_cont("Skylake events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "skylake";
|
2015-05-11 03:22:44 +08:00
|
|
|
break;
|
|
|
|
|
2019-06-03 21:41:21 +08:00
|
|
|
case INTEL_FAM6_ICELAKE_X:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ICELAKE_D:
|
2019-06-03 21:41:21 +08:00
|
|
|
pmem = true;
|
2020-08-24 06:36:59 +08:00
|
|
|
fallthrough;
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_ICELAKE_L:
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_ICELAKE:
|
2019-10-08 23:50:08 +08:00
|
|
|
case INTEL_FAM6_TIGERLAKE_L:
|
|
|
|
case INTEL_FAM6_TIGERLAKE:
|
2020-10-19 23:35:25 +08:00
|
|
|
case INTEL_FAM6_ROCKETLAKE:
|
2019-04-03 03:45:05 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_icl_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_icl_extra_regs;
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = icl_get_event_constraints;
|
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:17 +08:00
|
|
|
extra_skl_attr = skl_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
mem_attr = icl_events_attrs;
|
2020-07-24 01:11:13 +08:00
|
|
|
td_attr = icl_td_events_attrs;
|
2019-05-12 23:55:13 +08:00
|
|
|
tsx_attr = icl_tsx_events_attrs;
|
2020-11-26 05:37:19 +08:00
|
|
|
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
|
2019-04-03 03:45:05 +08:00
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
2019-06-03 21:41:21 +08:00
|
|
|
intel_pmu_pebs_data_source_skl(pmem);
|
2021-01-29 06:40:09 +08:00
|
|
|
x86_pmu.num_topdown_events = 4;
|
2020-07-24 01:11:13 +08:00
|
|
|
x86_pmu.update_topdown_event = icl_update_topdown_event;
|
|
|
|
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
|
2019-04-03 03:45:05 +08:00
|
|
|
pr_cont("Icelake events, ");
|
|
|
|
name = "icelake";
|
|
|
|
break;
|
|
|
|
|
perf/x86/intel: Add perf core PMU support for Sapphire Rapids
Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.
The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
which are above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on the GP
counter 0. Enable the Precise Distribution (PDist) facility with :ppp
event. For this facility to work, the period must be initialized with a
value larger than 127. Add spr_limit_period() to apply the limit for
:ppp event.
Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
introduced to indicate the case. A new event, mem-loads-aux, is
exposed to sysfs for the user tool.
Add a check in hw_config(). If the auxiliary event is not detected,
return a unique error -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
fields may be set by HW on some machines. Add pebs_no_block to
explicitly indicate the previous platforms which don't support the new
block fields. Accesses to the new block fields are ignored on those
platforms.
A new store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.
The layout of access latency field of PEBS Memory Info Record has been
changed. Two latencies, instruction latency (bits 15:0) and cache access
latency (bits 47:32), are recorded.
- The cache access latency is similar to previous memory access latency.
For loads, the latency starts by the actual cache access until the
data is returned by the memory subsystem.
For stores, the latency starts when the demand write accesses the L1
data cache and lasts until the cacheline write is completed in the
memory subsystem.
The cache access latency is stored in low 32bits of the sample type
PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
for execution and lasts until completion of the instruction it belongs
to.
Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
latency support. The instruction latency is stored in the bit 47:32
of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.
The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com
2021-01-29 06:40:10 +08:00
|
|
|
case INTEL_FAM6_SAPPHIRERAPIDS_X:
|
|
|
|
pmem = true;
|
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_spr_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_spr_extra_regs;
|
|
|
|
x86_pmu.limit_period = spr_limit_period;
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.pebs_block = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
x86_pmu.flags |= PMU_FL_PEBS_ALL;
|
|
|
|
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
|
|
|
|
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
|
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = spr_get_event_constraints;
|
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
|
|
|
extra_skl_attr = skl_format_attr;
|
|
|
|
mem_attr = spr_events_attrs;
|
|
|
|
td_attr = spr_td_events_attrs;
|
|
|
|
tsx_attr = spr_tsx_events_attrs;
|
|
|
|
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
|
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
|
|
|
intel_pmu_pebs_data_source_skl(pmem);
|
|
|
|
x86_pmu.num_topdown_events = 8;
|
|
|
|
x86_pmu.update_topdown_event = icl_update_topdown_event;
|
|
|
|
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
|
|
|
|
pr_cont("Sapphire Rapids events, ");
|
|
|
|
name = "sapphire_rapids";
|
|
|
|
break;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
default:
|
2011-06-29 23:42:36 +08:00
|
|
|
switch (x86_pmu.version) {
|
|
|
|
case 1:
|
|
|
|
x86_pmu.event_constraints = intel_v1_event_constraints;
|
|
|
|
pr_cont("generic architected perfmon v1, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "generic_arch_v1";
|
2011-06-29 23:42:36 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
/*
|
|
|
|
* default constraints for v2 and up
|
|
|
|
*/
|
|
|
|
x86_pmu.event_constraints = intel_gen_event_constraints;
|
|
|
|
pr_cont("generic architected perfmon, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "generic_arch_v2+";
|
2011-06-29 23:42:36 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
2011-11-10 20:57:26 +08:00
|
|
|
|
2018-10-28 20:58:28 +08:00
|
|
|
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
|
2017-08-23 02:52:01 +08:00
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
group_events_td.attrs = td_attr;
|
|
|
|
group_events_mem.attrs = mem_attr;
|
|
|
|
group_events_tsx.attrs = tsx_attr;
|
2019-05-12 23:55:16 +08:00
|
|
|
group_format_extra.attrs = extra_attr;
|
2019-05-12 23:55:17 +08:00
|
|
|
group_format_extra_skl.attrs = extra_skl_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
|
|
|
|
x86_pmu.attr_update = attr_update;
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2021-04-12 22:30:51 +08:00
|
|
|
intel_pmu_check_num_counters(&x86_pmu.num_counters,
|
|
|
|
&x86_pmu.num_counters_fixed,
|
|
|
|
&x86_pmu.intel_ctrl,
|
|
|
|
(u64)fixed_mask);
|
2012-06-21 02:46:34 +08:00
|
|
|
|
2020-10-29 03:42:47 +08:00
|
|
|
/* AnyThread may be deprecated on arch perfmon v5 or later */
|
|
|
|
if (x86_pmu.intel_cap.anythread_deprecated)
|
|
|
|
x86_pmu.format_attrs = intel_arch_formats_attr;
|
|
|
|
|
2012-06-21 02:46:34 +08:00
|
|
|
if (x86_pmu.event_constraints) {
|
|
|
|
/*
|
|
|
|
* event on fixed counter2 (REF_CYCLES) only works on this
|
|
|
|
* counter, so do not extend mask to generic counters
|
|
|
|
*/
|
|
|
|
for_each_event_constraint(c, x86_pmu.event_constraints) {
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may overflow. Bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
/*
|
|
|
|
* Don't extend the topdown slots and metrics
|
|
|
|
* events to the generic counters.
|
|
|
|
*/
|
|
|
|
if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
|
2021-01-29 06:40:11 +08:00
|
|
|
/*
|
|
|
|
* Disable topdown slots and metrics events,
|
|
|
|
* if slots event is not in CPUID.
|
|
|
|
*/
|
|
|
|
if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
|
|
|
|
c->idxmsk64 = 0;
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf has supported the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids the unnecessary writing of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
All four metric events don't support sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS related register may overflow. Bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
|
|
|
c->weight = hweight64(c->idxmsk64);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-01-29 06:40:11 +08:00
|
|
|
if (c->cmask == FIXED_EVENT_FLAGS) {
|
|
|
|
/* Disabled fixed counters which are not in CPUID */
|
|
|
|
c->idxmsk64 &= x86_pmu.intel_ctrl;
|
|
|
|
|
|
|
|
if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
|
|
|
|
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
|
2012-06-21 02:46:34 +08:00
|
|
|
}
|
2015-06-08 20:46:49 +08:00
|
|
|
c->idxmsk64 &=
|
2016-05-11 21:51:51 +08:00
|
|
|
~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
|
2015-06-08 20:46:49 +08:00
|
|
|
c->weight = hweight64(c->idxmsk64);
|
2012-06-21 02:46:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Access LBR MSR may cause #GP under certain circumstances.
|
|
|
|
* E.g. KVM doesn't support LBR MSR
|
|
|
|
* Check all LBR MSRs here.
|
|
|
|
* Disable LBR access if any LBR MSRs can not be accessed.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
|
|
|
|
x86_pmu.lbr_nr = 0;
|
|
|
|
for (i = 0; i < x86_pmu.lbr_nr; i++) {
|
|
|
|
if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
|
|
|
|
check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
|
|
|
|
x86_pmu.lbr_nr = 0;
|
|
|
|
}
|
|
|
|
|
2019-05-12 23:55:15 +08:00
|
|
|
if (x86_pmu.lbr_nr)
|
2016-06-22 02:31:10 +08:00
|
|
|
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
|
2017-08-23 02:52:01 +08:00
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Access extra MSR may cause #GP under certain circumstances.
|
|
|
|
* E.g. KVM doesn't support offcore event
|
|
|
|
* Check all extra_regs here.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.extra_regs) {
|
|
|
|
for (er = x86_pmu.extra_regs; er->msr; er++) {
|
2015-07-01 07:33:24 +08:00
|
|
|
er->extra_msr_access = check_msr(er->msr, 0x11UL);
|
2014-07-15 03:25:56 +08:00
|
|
|
/* Disable LBR select mapping */
|
|
|
|
if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
|
|
|
|
x86_pmu.lbr_sel_map = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-25 23:12:33 +08:00
|
|
|
/* Support full width counters using alternative MSR range */
|
|
|
|
if (x86_pmu.intel_cap.full_width_write) {
|
2016-11-30 04:33:28 +08:00
|
|
|
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
|
2013-06-25 23:12:33 +08:00
|
|
|
x86_pmu.perfctr = MSR_IA32_PMC0;
|
|
|
|
pr_cont("full-width counters, ");
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
|
|
|
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
|
2020-07-24 01:11:13 +08:00
|
|
|
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* HT bug: phase 2 init
|
|
|
|
* Called once we have valid topology information to check
|
|
|
|
* whether or not HT is enabled
|
|
|
|
* If HT is off, then we disable the workaround
|
|
|
|
*/
|
|
|
|
static __init int fixup_ht_bug(void)
|
|
|
|
{
|
2016-05-20 08:09:59 +08:00
|
|
|
int c;
|
2014-11-18 03:07:04 +08:00
|
|
|
/*
|
|
|
|
* problem not present on this CPU model, nothing to do
|
|
|
|
*/
|
|
|
|
if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
|
|
|
|
return 0;
|
|
|
|
|
2016-05-20 08:09:59 +08:00
|
|
|
if (topology_max_smt_threads() > 1) {
|
2014-11-18 03:07:04 +08:00
|
|
|
pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-09-13 03:36:56 +08:00
|
|
|
cpus_read_lock();
|
|
|
|
|
|
|
|
hardlockup_detector_perf_stop();
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
|
|
|
|
|
|
|
|
x86_pmu.start_scheduling = NULL;
|
2015-05-21 16:57:32 +08:00
|
|
|
x86_pmu.commit_scheduling = NULL;
|
2014-11-18 03:07:04 +08:00
|
|
|
x86_pmu.stop_scheduling = NULL;
|
|
|
|
|
2017-09-13 03:36:56 +08:00
|
|
|
hardlockup_detector_perf_restart();
|
2014-11-18 03:07:04 +08:00
|
|
|
|
2017-05-24 16:15:30 +08:00
|
|
|
for_each_online_cpu(c)
|
2019-03-06 05:23:15 +08:00
|
|
|
free_excl_cntrs(&per_cpu(cpu_hw_events, c));
|
2014-11-18 03:07:04 +08:00
|
|
|
|
2017-05-24 16:15:30 +08:00
|
|
|
cpus_read_unlock();
|
2014-11-18 03:07:04 +08:00
|
|
|
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
subsys_initcall(fixup_ht_bug)
|