2019-05-19 20:08:55 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2011-03-03 10:34:47 +08:00
|
|
|
/*
|
2011-06-06 22:57:03 +08:00
|
|
|
* Per core/cpu state
|
|
|
|
*
|
|
|
|
* Used to coordinate shared registers between HT threads or
|
|
|
|
* among events on a single PMU.
|
2011-03-03 10:34:47 +08:00
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
2011-05-27 00:22:53 +08:00
|
|
|
#include <linux/export.h>
|
2015-09-05 06:45:12 +08:00
|
|
|
#include <linux/nmi.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
#include <asm/cpufeature.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <asm/hardirq.h>
|
2016-06-03 08:19:29 +08:00
|
|
|
#include <asm/intel-family.h>
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
#include <asm/intel_pt.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <asm/apic.h>
|
2019-02-05 06:23:30 +08:00
|
|
|
#include <asm/cpu_device_id.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2016-02-10 17:55:23 +08:00
|
|
|
#include "../perf_event.h"
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
2010-02-01 22:36:30 +08:00
|
|
|
* Intel PerfMon, used on Core and later.
|
2010-02-26 19:05:05 +08:00
|
|
|
*/
|
2011-04-27 17:51:41 +08:00
|
|
|
static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2012-07-06 17:59:46 +08:00
|
|
|
[PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
|
|
|
|
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
|
|
|
|
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
|
|
|
|
[PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
|
|
|
|
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
|
|
|
|
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
|
|
|
|
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
|
|
|
|
[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_core_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_core2_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
|
2010-02-01 22:36:30 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
|
2013-01-24 23:10:32 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
|
2011-03-03 10:34:47 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
|
2010-06-10 19:25:01 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
|
2010-02-26 19:05:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_snb_event_constraints[] __read_mostly =
|
2011-03-02 21:27:04 +08:00
|
|
|
{
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2013-03-17 21:49:57 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
2011-03-02 21:27:04 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
|
2013-03-09 07:22:48 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
2014-11-18 03:06:59 +08:00
|
|
|
|
2016-07-02 06:22:22 +08:00
|
|
|
/*
|
|
|
|
* When HT is off these events can only run on the bottom 4 counters
|
|
|
|
* When HT is on, they are impacted by the HT bug and require EXCL access
|
|
|
|
*/
|
2014-11-18 03:06:59 +08:00
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
|
|
|
|
{
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
|
2013-09-11 23:22:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
|
2013-02-20 18:15:12 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
2014-11-18 03:06:59 +08:00
|
|
|
|
2016-07-02 06:22:22 +08:00
|
|
|
/*
|
|
|
|
* When HT is off these events can only run on the bottom 4 counters
|
|
|
|
* When HT is on, they are impacted by the HT bug and require EXCL access
|
|
|
|
*/
|
2014-11-18 03:06:59 +08:00
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
|
2013-01-24 23:10:32 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
|
2011-03-03 10:34:47 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2011-06-29 23:42:36 +08:00
|
|
|
static struct event_constraint intel_v1_event_constraints[] __read_mostly =
|
|
|
|
{
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_gen_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2013-07-18 17:02:24 +08:00
|
|
|
static struct event_constraint intel_slm_event_constraints[] __read_mostly =
|
|
|
|
{
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2016-05-17 05:16:18 +08:00
|
|
|
static struct event_constraint intel_skl_event_constraints[] = {
|
2015-05-11 03:22:44 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
2016-07-02 06:22:22 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* when HT is off, these can only run on the bottom 4 counters
|
|
|
|
*/
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xc6, 0xf), /* FRONTEND_RETIRED.* */
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
|
perf/x86/intel: Change offcore response masks for Knights Landing
Due to change in register definition we need to update OCR mask:
MSR_OFFCORE_RESP0 reserved bits: 3,4,18,29,30,33,34, 8,11,14
MSR_OFFCORE_RESP1 reserved bits: 3,4,18,29,30,33,34, 38
Reported-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Lukasz Odzioba <lukasz.odzioba@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: akpm@linux-foundation.org
Cc: hpa@zytor.com
Cc: kan.liang@intel.com
Cc: lukasz.anaczkowski@intel.com
Cc: zheng.z.yan@intel.com
Link: http://lkml.kernel.org/r/1463433419-16893-1-git-send-email-lukasz.odzioba@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-05-17 05:16:59 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x799ffbb6e7ull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x399ffbffe7ull, RSP_1),
|
2015-12-08 06:28:18 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2011-06-06 22:57:12 +08:00
|
|
|
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
|
2013-01-24 23:10:32 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
2013-04-16 19:51:43 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
|
2013-06-08 05:22:10 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
2011-06-06 22:57:12 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
|
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
2015-09-10 05:54:00 +08:00
|
|
|
/*
|
|
|
|
* Note the low 8 bits eventsel code is not a continuous field, containing
|
|
|
|
* some #GPing bits. These are masked out.
|
|
|
|
*/
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
|
2015-05-11 03:22:44 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct event_constraint intel_icl_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf),
|
|
|
|
INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf),
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
|
perf/x86/intel: Fix invalid Bit 13 for Icelake MSR_OFFCORE_RSP_x register
The Intel SDM states that bit 13 of Icelake's MSR_OFFCORE_RSP_x
register is valid, and used for counting hardware generated prefetches
of L3 cache. Update the bitmask to allow bit 13.
Before:
$ perf stat -e cpu/event=0xb7,umask=0x1,config1=0x1bfff/u sleep 3
Performance counter stats for 'sleep 3':
<not supported> cpu/event=0xb7,umask=0x1,config1=0x1bfff/u
After:
$ perf stat -e cpu/event=0xb7,umask=0x1,config1=0x1bfff/u sleep 3
Performance counter stats for 'sleep 3':
9,293 cpu/event=0xb7,umask=0x1,config1=0x1bfff/u
Signed-off-by: Yunying Sun <yunying.sun@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: alexander.shishkin@linux.intel.com
Cc: bp@alien8.de
Cc: hpa@zytor.com
Cc: jolsa@redhat.com
Cc: namhyung@kernel.org
Link: https://lkml.kernel.org/r/20190724082932.12833-1-yunying.sun@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-07-24 16:29:32 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffbfffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffffbfffull, RSP_1),
|
2019-04-03 03:45:05 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
|
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
|
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
|
|
|
|
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
static struct attribute *nhm_mem_events_attrs[] = {
|
2013-01-24 23:10:32 +08:00
|
|
|
EVENT_PTR(mem_ld_nhm),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2016-05-20 08:09:57 +08:00
|
|
|
/*
|
|
|
|
* topdown events for Intel Core CPUs.
|
|
|
|
*
|
|
|
|
* The events are all in slots, which is a free slot in a 4 wide
|
|
|
|
* pipeline. Some events are already reported in slots, for cycle
|
|
|
|
* events we multiply by the pipeline width (4).
|
|
|
|
*
|
|
|
|
* With Hyper Threading on, topdown metrics are either summed or averaged
|
|
|
|
* between the threads of a core: (count_t0 + count_t1).
|
|
|
|
*
|
|
|
|
* For the average case the metric is always scaled to pipeline width,
|
|
|
|
* so we use factor 2 ((count_t0 + count_t1) / 2 * 4)
|
|
|
|
*/
|
|
|
|
|
|
|
|
EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots,
|
|
|
|
"event=0x3c,umask=0x0", /* cpu_clk_unhalted.thread */
|
|
|
|
"event=0x3c,umask=0x0,any=1"); /* cpu_clk_unhalted.thread_any */
|
|
|
|
EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2");
|
|
|
|
EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued,
|
|
|
|
"event=0xe,umask=0x1"); /* uops_issued.any */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired,
|
|
|
|
"event=0xc2,umask=0x2"); /* uops_retired.retire_slots */
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles,
|
|
|
|
"event=0x9c,umask=0x1"); /* idq_uops_not_delivered_core */
|
|
|
|
EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
|
|
|
|
"event=0xd,umask=0x3,cmask=1", /* int_misc.recovery_cycles */
|
|
|
|
"event=0xd,umask=0x3,cmask=1,any=1"); /* int_misc.recovery_cycles_any */
|
|
|
|
EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
|
|
|
|
"4", "2");
|
|
|
|
|
2016-05-17 05:16:18 +08:00
|
|
|
static struct attribute *snb_events_attrs[] = {
|
2016-05-20 08:09:57 +08:00
|
|
|
EVENT_PTR(td_slots_issued),
|
|
|
|
EVENT_PTR(td_slots_retired),
|
|
|
|
EVENT_PTR(td_fetch_bubbles),
|
|
|
|
EVENT_PTR(td_total_slots),
|
|
|
|
EVENT_PTR(td_total_slots_scale),
|
|
|
|
EVENT_PTR(td_recovery_bubbles),
|
|
|
|
EVENT_PTR(td_recovery_bubbles_scale),
|
2013-01-24 23:10:32 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
static struct attribute *snb_mem_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_snb),
|
|
|
|
EVENT_PTR(mem_st_snb),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint intel_hsw_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2015-11-25 01:05:01 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
|
2013-06-18 08:36:48 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
|
|
|
|
/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
|
2013-06-18 08:36:48 +08:00
|
|
|
/* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
|
2013-06-18 08:36:48 +08:00
|
|
|
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
|
2014-11-18 03:06:59 +08:00
|
|
|
|
2016-07-02 06:22:22 +08:00
|
|
|
/*
|
|
|
|
* When HT is off these events can only run on the bottom 4 counters
|
|
|
|
* When HT is on, they are impacted by the HT bug and require EXCL access
|
|
|
|
*/
|
2014-11-18 03:06:59 +08:00
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2016-05-17 05:16:18 +08:00
|
|
|
static struct event_constraint intel_bdw_event_constraints[] = {
|
2015-02-18 10:18:05 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
|
2015-11-17 08:21:07 +08:00
|
|
|
INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
|
2016-07-02 06:22:22 +08:00
|
|
|
/*
|
|
|
|
* when HT is off, these can only run on the bottom 4 counters
|
|
|
|
*/
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
|
2015-02-18 10:18:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static u64 intel_pmu_event_map(int hw_event)
|
|
|
|
{
|
|
|
|
return intel_perfmon_event_map[hw_event];
|
|
|
|
}
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
/*
|
|
|
|
* Notes on the events:
|
|
|
|
* - data reads do not include code reads (comparable to earlier tables)
|
|
|
|
* - data counts include speculative execution (except L1 write, dtlb, bpu)
|
|
|
|
* - remote node access includes remote memory, remote cache, remote mmio.
|
|
|
|
* - prefetches are not included in the counts.
|
|
|
|
* - icache miss does not include decoded icache
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define SKL_DEMAND_DATA_RD BIT_ULL(0)
|
|
|
|
#define SKL_DEMAND_RFO BIT_ULL(1)
|
|
|
|
#define SKL_ANY_RESPONSE BIT_ULL(16)
|
|
|
|
#define SKL_SUPPLIER_NONE BIT_ULL(17)
|
|
|
|
#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
|
|
|
|
#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
|
|
|
|
#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
|
|
|
|
#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
|
|
|
|
#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP0_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP1_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP2P_DRAM)
|
|
|
|
#define SKL_SPL_HIT BIT_ULL(30)
|
|
|
|
#define SKL_SNOOP_NONE BIT_ULL(31)
|
|
|
|
#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
|
|
|
|
#define SKL_SNOOP_MISS BIT_ULL(33)
|
|
|
|
#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
|
|
|
|
#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
|
|
|
|
#define SKL_SNOOP_HITM BIT_ULL(36)
|
|
|
|
#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
|
|
|
|
#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
|
|
|
|
SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
|
|
|
|
SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
|
|
|
|
SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
|
|
|
|
#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
|
|
|
|
#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
|
|
|
|
SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
|
|
|
|
SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
|
|
|
|
SKL_SNOOP_HITM|SKL_SPL_HIT)
|
|
|
|
#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
|
|
|
|
#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
|
|
|
|
#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP1_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP2P_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 skl_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
|
2017-06-19 22:26:09 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
|
2015-05-11 03:22:44 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
|
2017-06-19 22:26:09 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
|
2015-05-11 03:22:44 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 skl_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_LLC_ACCESS|SKL_ANY_SNOOP,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS|SKL_ANY_SNOOP|
|
|
|
|
SKL_SUPPLIER_NONE,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_LLC_ACCESS|SKL_ANY_SNOOP,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS|SKL_ANY_SNOOP|
|
|
|
|
SKL_SUPPLIER_NONE,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2012-07-17 17:27:55 +08:00
|
|
|
#define SNB_DMND_DATA_RD (1ULL << 0)
|
|
|
|
#define SNB_DMND_RFO (1ULL << 1)
|
|
|
|
#define SNB_DMND_IFETCH (1ULL << 2)
|
|
|
|
#define SNB_DMND_WB (1ULL << 3)
|
|
|
|
#define SNB_PF_DATA_RD (1ULL << 4)
|
|
|
|
#define SNB_PF_RFO (1ULL << 5)
|
|
|
|
#define SNB_PF_IFETCH (1ULL << 6)
|
|
|
|
#define SNB_LLC_DATA_RD (1ULL << 7)
|
|
|
|
#define SNB_LLC_RFO (1ULL << 8)
|
|
|
|
#define SNB_LLC_IFETCH (1ULL << 9)
|
|
|
|
#define SNB_BUS_LOCKS (1ULL << 10)
|
|
|
|
#define SNB_STRM_ST (1ULL << 11)
|
|
|
|
#define SNB_OTHER (1ULL << 15)
|
|
|
|
#define SNB_RESP_ANY (1ULL << 16)
|
|
|
|
#define SNB_NO_SUPP (1ULL << 17)
|
|
|
|
#define SNB_LLC_HITM (1ULL << 18)
|
|
|
|
#define SNB_LLC_HITE (1ULL << 19)
|
|
|
|
#define SNB_LLC_HITS (1ULL << 20)
|
|
|
|
#define SNB_LLC_HITF (1ULL << 21)
|
|
|
|
#define SNB_LOCAL (1ULL << 22)
|
|
|
|
#define SNB_REMOTE (0xffULL << 23)
|
|
|
|
#define SNB_SNP_NONE (1ULL << 31)
|
|
|
|
#define SNB_SNP_NOT_NEEDED (1ULL << 32)
|
|
|
|
#define SNB_SNP_MISS (1ULL << 33)
|
|
|
|
#define SNB_NO_FWD (1ULL << 34)
|
|
|
|
#define SNB_SNP_FWD (1ULL << 35)
|
|
|
|
#define SNB_HITM (1ULL << 36)
|
|
|
|
#define SNB_NON_DRAM (1ULL << 37)
|
|
|
|
|
|
|
|
#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
|
|
|
|
#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
|
|
|
|
#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
|
|
|
|
#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
|
|
|
|
SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
|
|
|
|
SNB_HITM)
|
|
|
|
|
|
|
|
#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
|
|
|
|
#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
|
|
|
|
|
|
|
|
#define SNB_L3_ACCESS SNB_RESP_ANY
|
|
|
|
#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 snb_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
static __initconst const u64 snb_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
};
|
|
|
|
|
2015-02-18 10:18:04 +08:00
|
|
|
/*
|
|
|
|
* Notes on the events:
|
|
|
|
* - data reads do not include code reads (comparable to earlier tables)
|
|
|
|
* - data counts include speculative execution (except L1 write, dtlb, bpu)
|
|
|
|
* - remote node access includes remote memory, remote cache, remote mmio.
|
|
|
|
* - prefetches are not included in the counts because they are not
|
|
|
|
* reliably counted.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define HSW_DEMAND_DATA_RD BIT_ULL(0)
|
|
|
|
#define HSW_DEMAND_RFO BIT_ULL(1)
|
|
|
|
#define HSW_ANY_RESPONSE BIT_ULL(16)
|
|
|
|
#define HSW_SUPPLIER_NONE BIT_ULL(17)
|
|
|
|
#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
|
|
|
|
#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
|
|
|
|
#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
|
|
|
|
#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
|
|
|
|
#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
#define HSW_SNOOP_NONE BIT_ULL(31)
|
|
|
|
#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
|
|
|
|
#define HSW_SNOOP_MISS BIT_ULL(33)
|
|
|
|
#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
|
|
|
|
#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
|
|
|
|
#define HSW_SNOOP_HITM BIT_ULL(36)
|
|
|
|
#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
|
|
|
|
#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
|
|
|
|
HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
|
|
|
|
HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
|
|
|
|
HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
|
|
|
|
#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
|
|
|
|
#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
|
|
|
|
#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
|
|
|
|
#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
|
|
|
|
HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
|
|
|
|
|
2015-02-18 10:18:05 +08:00
|
|
|
#define BDW_L3_MISS_LOCAL BIT(26)
|
|
|
|
#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
|
|
|
|
|
2015-02-18 10:18:04 +08:00
|
|
|
static __initconst const u64 hsw_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 hsw_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS|HSW_ANY_SNOOP,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS|HSW_ANY_SNOOP,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS_LOCAL_DRAM|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS_REMOTE|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS_LOCAL_DRAM|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS_REMOTE|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 westmere_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
|
|
|
* Use RFO, not WRITEBACK, because a write miss would typically occur
|
|
|
|
* on RFO.
|
|
|
|
*/
|
2010-02-26 19:05:05 +08:00
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
2011-04-23 06:57:42 +08:00
|
|
|
* Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
|
|
|
|
* See IA32 SDM Vol 3B 30.6.1.3
|
2011-03-03 10:34:48 +08:00
|
|
|
*/
|
|
|
|
|
2011-04-23 06:57:42 +08:00
|
|
|
#define NHM_DMND_DATA_RD (1 << 0)
|
|
|
|
#define NHM_DMND_RFO (1 << 1)
|
|
|
|
#define NHM_DMND_IFETCH (1 << 2)
|
|
|
|
#define NHM_DMND_WB (1 << 3)
|
|
|
|
#define NHM_PF_DATA_RD (1 << 4)
|
|
|
|
#define NHM_PF_DATA_RFO (1 << 5)
|
|
|
|
#define NHM_PF_IFETCH (1 << 6)
|
|
|
|
#define NHM_OFFCORE_OTHER (1 << 7)
|
|
|
|
#define NHM_UNCORE_HIT (1 << 8)
|
|
|
|
#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
|
|
|
|
#define NHM_OTHER_CORE_HITM (1 << 10)
|
|
|
|
/* reserved */
|
|
|
|
#define NHM_REMOTE_CACHE_FWD (1 << 12)
|
|
|
|
#define NHM_REMOTE_DRAM (1 << 13)
|
|
|
|
#define NHM_LOCAL_DRAM (1 << 14)
|
|
|
|
#define NHM_NON_DRAM (1 << 15)
|
|
|
|
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
|
|
|
|
#define NHM_REMOTE (NHM_REMOTE_DRAM)
|
2011-04-23 06:57:42 +08:00
|
|
|
|
|
|
|
#define NHM_DMND_READ (NHM_DMND_DATA_RD)
|
|
|
|
#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
|
|
|
|
#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
|
|
|
|
|
|
|
|
#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
|
2011-04-23 06:57:42 +08:00
|
|
|
#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
|
2011-03-03 10:34:48 +08:00
|
|
|
|
|
|
|
static __initconst const u64 nehalem_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 nehalem_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
perf, x86: Update/fix Intel Nehalem cache events
Change the Nehalem cache events to use retired memory instruction counters
(similar to Westmere), this greatly improves the provided stats.
Using:
main ()
{
int i;
for (i = 0; i < 1000000000; i++) {
asm("mov (%%rsp), %%rbx;"
"mov %%rbx, (%%rsp);" : : : "rbx");
}
}
We find:
$ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% )
4,999,502,846 l1-dcache-loads:u ( +- 0.008% )
1,000,034,832 l1-dcache-stores:u ( +- 0.000% )
1.565184942 seconds time elapsed ( +- 0.005% )
The 5b is surprising - we'd expect 1b:
$ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% )
1,000,021,961 r10b:u ( +- 0.000% )
1,000,030,951 l1-dcache-stores:u ( +- 0.000% )
1.565055422 seconds time elapsed ( +- 0.003% )
Which this patch thus fixes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-22 19:39:56 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
perf, x86: Update/fix Intel Nehalem cache events
Change the Nehalem cache events to use retired memory instruction counters
(similar to Westmere), this greatly improves the provided stats.
Using:
main ()
{
int i;
for (i = 0; i < 1000000000; i++) {
asm("mov (%%rsp), %%rbx;"
"mov %%rbx, (%%rsp);" : : : "rbx");
}
}
We find:
$ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% )
4,999,502,846 l1-dcache-loads:u ( +- 0.008% )
1,000,034,832 l1-dcache-stores:u ( +- 0.000% )
1.565184942 seconds time elapsed ( +- 0.005% )
The 5b is surprising - we'd expect 1b:
$ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% )
1,000,021,961 r10b:u ( +- 0.000% )
1,000,030,951 l1-dcache-stores:u ( +- 0.000% )
1.565055422 seconds time elapsed ( +- 0.003% )
Which this patch thus fixes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-22 19:39:56 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
|
|
|
* Use RFO, not WRITEBACK, because a write miss would typically occur
|
|
|
|
* on RFO.
|
|
|
|
*/
|
2010-02-26 19:05:05 +08:00
|
|
|
[ C(OP_WRITE) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 core2_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 atom_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2016-05-20 08:09:58 +08:00
|
|
|
EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c");
|
|
|
|
EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2");
|
|
|
|
/* no_alloc_cycles.not_delivered */
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm,
|
|
|
|
"event=0xca,umask=0x50");
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2");
|
|
|
|
/* uops_retired.all */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm,
|
|
|
|
"event=0xc2,umask=0x10");
|
|
|
|
/* uops_retired.all */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm,
|
|
|
|
"event=0xc2,umask=0x10");
|
|
|
|
|
|
|
|
static struct attribute *slm_events_attrs[] = {
|
|
|
|
EVENT_PTR(td_total_slots_slm),
|
|
|
|
EVENT_PTR(td_total_slots_scale_slm),
|
|
|
|
EVENT_PTR(td_fetch_bubbles_slm),
|
|
|
|
EVENT_PTR(td_fetch_bubbles_scale_slm),
|
|
|
|
EVENT_PTR(td_slots_issued_slm),
|
|
|
|
EVENT_PTR(td_slots_retired_slm),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2013-07-18 17:02:24 +08:00
|
|
|
static struct extra_reg intel_slm_extra_regs[] __read_mostly =
|
|
|
|
{
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
perf/x86/intel: Fix Silvermont offcore masks
Fengguang Wu reported:
> sparse warnings: (new ones prefixed by >>)
>
> >> arch/x86/kernel/cpu/perf_event_intel.c:901:9: sparse: constant 0x768005ffff is so big it is long
> >> arch/x86/kernel/cpu/perf_event_intel.c:902:9: sparse: constant 0x768005ffff is so big it is long
>
> vim +901 arch/x86/kernel/cpu/perf_event_intel.c
>
> 895 },
> 896 };
> 897
> 898 static struct extra_reg intel_slm_extra_regs[] __read_mostly =
> 899 {
> 900 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
> > 901 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0),
> > 902 INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1),
> 903 EVENT_EXTRA_END
> 904 };
> 905
Extend those constants to 64 bits.
Reported-by: fengguang.wu@intel.com
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130909112636.GQ31370@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-09 19:26:36 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
|
2015-06-25 02:23:35 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
|
2013-07-18 17:02:24 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SLM_DMND_READ SNB_DMND_DATA_RD
|
|
|
|
#define SLM_DMND_WRITE SNB_DMND_RFO
|
|
|
|
#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
|
|
|
|
#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
|
|
|
|
#define SLM_LLC_ACCESS SNB_RESP_ANY
|
|
|
|
#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 slm_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0,
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 slm_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0,
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2017-02-10 15:23:58 +08:00
|
|
|
EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
|
|
|
|
EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
|
|
|
|
/* UOPS_NOT_DELIVERED.ANY */
|
|
|
|
EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
|
|
|
|
/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
|
|
|
|
EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
|
|
|
|
/* UOPS_RETIRED.ANY */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
|
|
|
|
/* UOPS_ISSUED.ANY */
|
|
|
|
EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
|
|
|
|
|
|
|
|
static struct attribute *glm_events_attrs[] = {
|
|
|
|
EVENT_PTR(td_total_slots_glm),
|
|
|
|
EVENT_PTR(td_total_slots_scale_glm),
|
|
|
|
EVENT_PTR(td_fetch_bubbles_glm),
|
|
|
|
EVENT_PTR(td_recovery_bubbles_glm),
|
|
|
|
EVENT_PTR(td_slots_issued_glm),
|
|
|
|
EVENT_PTR(td_slots_retired_glm),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-04-15 15:42:47 +08:00
|
|
|
static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1),
|
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
#define GLM_DEMAND_DATA_RD BIT_ULL(0)
|
|
|
|
#define GLM_DEMAND_RFO BIT_ULL(1)
|
|
|
|
#define GLM_ANY_RESPONSE BIT_ULL(16)
|
|
|
|
#define GLM_SNP_NONE_OR_MISS BIT_ULL(33)
|
|
|
|
#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD
|
|
|
|
#define GLM_DEMAND_WRITE GLM_DEMAND_RFO
|
|
|
|
#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
#define GLM_LLC_ACCESS GLM_ANY_RESPONSE
|
|
|
|
#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM)
|
|
|
|
#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 glm_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(L1D)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(L1I)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(DTLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(ITLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(BPU)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 glm_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_PREFETCH|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2017-07-12 21:44:23 +08:00
|
|
|
static __initconst const u64 glp_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(L1D)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(L1I)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(DTLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[C(RESULT_MISS)] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[C(RESULT_MISS)] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(ITLB)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[C(BPU)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = -1,
|
|
|
|
[C(RESULT_MISS)] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 glp_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_READ|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = GLM_DEMAND_WRITE|
|
|
|
|
GLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
#define TNT_LOCAL_DRAM BIT_ULL(26)
|
|
|
|
#define TNT_DEMAND_READ GLM_DEMAND_DATA_RD
|
|
|
|
#define TNT_DEMAND_WRITE GLM_DEMAND_RFO
|
|
|
|
#define TNT_LLC_ACCESS GLM_ANY_RESPONSE
|
|
|
|
#define TNT_SNP_ANY (SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \
|
|
|
|
SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM)
|
|
|
|
#define TNT_LLC_MISS (TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 tnt_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = TNT_DEMAND_READ|
|
|
|
|
TNT_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = TNT_DEMAND_READ|
|
|
|
|
TNT_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = TNT_DEMAND_WRITE|
|
|
|
|
TNT_LLC_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = TNT_DEMAND_WRITE|
|
|
|
|
TNT_LLC_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = 0x0,
|
|
|
|
[C(RESULT_MISS)] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
2020-05-01 20:54:42 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1),
|
2019-04-11 02:57:09 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
|
|
|
|
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
|
|
|
|
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
|
|
|
|
#define KNL_MCDRAM_FAR BIT_ULL(22)
|
|
|
|
#define KNL_DDR_LOCAL BIT_ULL(23)
|
|
|
|
#define KNL_DDR_FAR BIT_ULL(24)
|
|
|
|
#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
|
|
|
|
KNL_DDR_LOCAL | KNL_DDR_FAR)
|
|
|
|
#define KNL_L2_READ SLM_DMND_READ
|
|
|
|
#define KNL_L2_WRITE SLM_DMND_WRITE
|
|
|
|
#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
|
|
|
|
#define KNL_L2_ACCESS SLM_LLC_ACCESS
|
|
|
|
#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
|
|
|
|
KNL_DRAM_ANY | SNB_SNP_ANY | \
|
|
|
|
SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 knl_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = 0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
/*
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
* Used from PMIs where the LBRs are already disabled.
|
|
|
|
*
|
|
|
|
* This function could be called consecutively. It is required to remain in
|
|
|
|
* disabled state if called consecutively.
|
|
|
|
*
|
|
|
|
* During consecutive calls, the same disable value will be written to related
|
2016-09-15 16:22:33 +08:00
|
|
|
* registers, so the PMU state remains unchanged.
|
|
|
|
*
|
|
|
|
* intel_bts events don't coexist with intel PMU's BTS events because of
|
|
|
|
* x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them
|
|
|
|
* disabled around intel PMU's event batching etc, only inside the PMI handler.
|
2020-01-22 02:13:38 +08:00
|
|
|
*
|
|
|
|
* Avoid PEBS_ENABLE MSR access in PMIs.
|
|
|
|
* The GLOBAL_CTRL has been disabled. All the counters do not count anymore.
|
|
|
|
* It doesn't matter if the PEBS is enabled or not.
|
|
|
|
* Usually, the PEBS status are not changed in PMIs. It's unnecessary to
|
|
|
|
* access PEBS_ENABLE MSR in disable_all()/enable_all().
|
|
|
|
* However, there are some cases which may change PEBS status, e.g. PMI
|
|
|
|
* throttle. The PEBS_ENABLE should be updated where the status changes.
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
*/
|
|
|
|
static void __intel_pmu_disable_all(void)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_disable_bts();
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_disable_all(void)
|
|
|
|
{
|
|
|
|
__intel_pmu_disable_all();
|
2020-01-22 02:13:38 +08:00
|
|
|
intel_pmu_pebs_disable_all();
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_disable_all();
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
static void __intel_pmu_enable_all(int added, bool pmi)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
intel_pmu_lbr_enable_all(pmi);
|
2011-10-05 20:01:21 +08:00
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
|
|
|
|
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
|
2010-02-26 19:05:05 +08:00
|
|
|
struct perf_event *event =
|
2012-06-21 02:46:33 +08:00
|
|
|
cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!event))
|
|
|
|
return;
|
|
|
|
|
|
|
|
intel_pmu_enable_bts(event->hw.config);
|
2016-09-15 16:22:33 +08:00
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
static void intel_pmu_enable_all(int added)
|
|
|
|
{
|
2020-01-22 02:13:38 +08:00
|
|
|
intel_pmu_pebs_enable_all();
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
__intel_pmu_enable_all(added, false);
|
|
|
|
}
|
|
|
|
|
2010-03-26 21:08:44 +08:00
|
|
|
/*
|
|
|
|
* Workaround for:
|
|
|
|
* Intel Errata AAK100 (model 26)
|
|
|
|
* Intel Errata AAP53 (model 30)
|
2010-03-29 22:37:17 +08:00
|
|
|
* Intel Errata BD53 (model 44)
|
2010-03-26 21:08:44 +08:00
|
|
|
*
|
2010-08-06 13:39:08 +08:00
|
|
|
* The official story:
|
|
|
|
* These chips need to be 'reset' when adding counters by programming the
|
|
|
|
* magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
|
|
|
|
* in sequence on the same PMC or on different PMCs.
|
|
|
|
*
|
|
|
|
* In practise it appears some of these events do in fact count, and
|
2018-12-03 17:47:34 +08:00
|
|
|
* we need to program all 4 events.
|
2010-03-26 21:08:44 +08:00
|
|
|
*/
|
2010-08-06 13:39:08 +08:00
|
|
|
static void intel_pmu_nhm_workaround(void)
|
2010-03-26 21:08:44 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-08-06 13:39:08 +08:00
|
|
|
static const unsigned long nhm_magic[4] = {
|
|
|
|
0x4300B5,
|
|
|
|
0x4300D2,
|
|
|
|
0x4300B1,
|
|
|
|
0x4300B1
|
|
|
|
};
|
|
|
|
struct perf_event *event;
|
|
|
|
int i;
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/*
|
|
|
|
* The Errata requires below steps:
|
|
|
|
* 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 2) Configure 4 PERFEVTSELx with the magic events and clear
|
|
|
|
* the corresponding PMCx;
|
|
|
|
* 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 5) Clear 4 pairs of ERFEVTSELx and PMCx;
|
|
|
|
*/
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/*
|
|
|
|
* The real steps we choose are a little different from above.
|
|
|
|
* A) To reduce MSR operations, we don't run step 1) as they
|
|
|
|
* are already cleared before this function is called;
|
|
|
|
* B) Call x86_perf_event_update to save PMCx before configuring
|
|
|
|
* PERFEVTSELx with magic number;
|
|
|
|
* C) With step 5), we do clear only when the PERFEVTSELx is
|
|
|
|
* not used currently.
|
|
|
|
* D) Call x86_perf_event_set_period to restore PMCx;
|
|
|
|
*/
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/* We always operate 4 pairs of PERF Counters */
|
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
event = cpuc->events[i];
|
|
|
|
if (event)
|
|
|
|
x86_perf_event_update(event);
|
|
|
|
}
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
|
|
|
|
}
|
|
|
|
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
event = cpuc->events[i];
|
|
|
|
|
|
|
|
if (event) {
|
|
|
|
x86_perf_event_set_period(event);
|
2010-04-14 04:23:14 +08:00
|
|
|
__x86_pmu_enable_event(&event->hw,
|
2010-08-06 13:39:08 +08:00
|
|
|
ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
} else
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
|
2010-03-26 21:08:44 +08:00
|
|
|
}
|
2010-08-06 13:39:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_nhm_enable_all(int added)
|
|
|
|
{
|
|
|
|
if (added)
|
|
|
|
intel_pmu_nhm_workaround();
|
2010-03-26 21:08:44 +08:00
|
|
|
intel_pmu_enable_all(added);
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on)
|
|
|
|
{
|
|
|
|
u64 val = on ? MSR_TFA_RTM_FORCE_ABORT : 0;
|
|
|
|
|
|
|
|
if (cpuc->tfa_shadow != val) {
|
|
|
|
cpuc->tfa_shadow = val;
|
|
|
|
wrmsrl(MSR_TSX_FORCE_ABORT, val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We're going to use PMC3, make sure TFA is set before we touch it.
|
|
|
|
*/
|
2019-03-14 16:47:18 +08:00
|
|
|
if (cntr == 3)
|
2019-03-06 05:23:18 +08:00
|
|
|
intel_set_tfa(cpuc, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_tfa_pmu_enable_all(int added)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we find PMC3 is no longer used when we enable the PMU, we can
|
|
|
|
* clear TFA.
|
|
|
|
*/
|
|
|
|
if (!test_bit(3, cpuc->active_mask))
|
|
|
|
intel_set_tfa(cpuc, false);
|
|
|
|
|
|
|
|
intel_pmu_enable_all(added);
|
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:07 +08:00
|
|
|
static void enable_counter_freeze(void)
|
|
|
|
{
|
|
|
|
update_debugctlmsr(get_debugctlmsr() |
|
|
|
|
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void disable_counter_freeze(void)
|
|
|
|
{
|
|
|
|
update_debugctlmsr(get_debugctlmsr() &
|
|
|
|
~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static inline u64 intel_pmu_get_status(void)
|
|
|
|
{
|
|
|
|
u64 status;
|
|
|
|
|
|
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_pmu_ack_status(u64 ack)
|
|
|
|
{
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
|
|
|
|
}
|
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
static inline bool event_is_checkpointed(struct perf_event *event)
|
|
|
|
{
|
|
|
|
return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_set_masks(struct perf_event *event, int idx)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
if (event->attr.exclude_host)
|
|
|
|
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
|
|
|
|
if (event->attr.exclude_guest)
|
|
|
|
__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
|
|
|
|
if (event_is_checkpointed(event))
|
|
|
|
__set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_clear_masks(struct perf_event *event, int idx)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2020-06-13 16:09:47 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
__clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
|
|
|
|
__clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
|
|
|
|
__clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_disable_fixed(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2012-06-21 02:46:33 +08:00
|
|
|
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
|
2010-02-26 19:05:05 +08:00
|
|
|
u64 ctrl_val, mask;
|
|
|
|
|
|
|
|
mask = 0xfULL << (idx * 4);
|
|
|
|
|
|
|
|
rdmsrl(hwc->config_base, ctrl_val);
|
|
|
|
ctrl_val &= ~mask;
|
2010-03-08 20:51:31 +08:00
|
|
|
wrmsrl(hwc->config_base, ctrl_val);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
static void intel_pmu_disable_event(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:32:08 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2020-06-13 16:09:47 +08:00
|
|
|
int idx = hwc->idx;
|
2010-03-03 03:32:08 +08:00
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
if (idx < INTEL_PMC_IDX_FIXED) {
|
|
|
|
intel_clear_masks(event, idx);
|
|
|
|
x86_pmu_disable_event(event);
|
|
|
|
} else if (idx < INTEL_PMC_IDX_FIXED_BTS) {
|
|
|
|
intel_clear_masks(event, idx);
|
|
|
|
intel_pmu_disable_fixed(event);
|
|
|
|
} else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_disable_bts();
|
|
|
|
intel_pmu_drain_bts_buffer();
|
2020-06-13 16:09:50 +08:00
|
|
|
} else if (idx == INTEL_PMC_IDX_FIXED_VLBR)
|
|
|
|
intel_clear_masks(event, idx);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2019-05-04 23:15:56 +08:00
|
|
|
/*
|
|
|
|
* Needs to be called after x86_pmu_disable_event,
|
|
|
|
* so we don't trigger the event without PEBS bit set.
|
|
|
|
*/
|
|
|
|
if (unlikely(event->attr.precise_ip))
|
|
|
|
intel_pmu_pebs_disable(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
static void intel_pmu_del_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (needs_branch_stack(event))
|
|
|
|
intel_pmu_lbr_del(event);
|
|
|
|
if (event->attr.precise_ip)
|
|
|
|
intel_pmu_pebs_del(event);
|
|
|
|
}
|
|
|
|
|
2018-02-13 06:20:34 +08:00
|
|
|
static void intel_pmu_read_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
|
|
|
|
intel_pmu_auto_reload_read(event);
|
|
|
|
else
|
|
|
|
x86_perf_event_update(event);
|
|
|
|
}
|
|
|
|
|
2018-03-09 10:15:40 +08:00
|
|
|
static void intel_pmu_enable_fixed(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2018-03-09 10:15:40 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2012-06-21 02:46:33 +08:00
|
|
|
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
|
2018-03-09 10:15:40 +08:00
|
|
|
u64 ctrl_val, mask, bits = 0;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
/*
|
2018-03-09 10:15:40 +08:00
|
|
|
* Enable IRQ generation (0x8), if not PEBS,
|
2010-02-26 19:05:05 +08:00
|
|
|
* and enable ring-3 counting (0x2) and ring-0 counting (0x1)
|
|
|
|
* if requested:
|
|
|
|
*/
|
2018-03-09 10:15:40 +08:00
|
|
|
if (!event->attr.precise_ip)
|
|
|
|
bits |= 0x8;
|
2010-02-26 19:05:05 +08:00
|
|
|
if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
|
|
|
|
bits |= 0x2;
|
|
|
|
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
|
|
|
|
bits |= 0x1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ANY bit is supported in v3 and up
|
|
|
|
*/
|
|
|
|
if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
|
|
|
|
bits |= 0x4;
|
|
|
|
|
|
|
|
bits <<= (idx * 4);
|
|
|
|
mask = 0xfULL << (idx * 4);
|
|
|
|
|
perf/x86/intel: Support adaptive PEBS v4
Adaptive PEBS is a new way to report PEBS sampling information. Instead
of a fixed size record for all PEBS events it allows to configure the
PEBS record to only include the information needed. Events can then opt
in to use such an extended record, or stay with a basic record which
only contains the IP.
The major new feature is to support LBRs in PEBS record.
Besides normal LBR, this allows (much faster) large PEBS, while still
supporting callstacks through callstack LBR. So essentially a lot of
profiling can now be done without frequent interrupts, dropping the
overhead significantly.
The main requirement still is to use a period, and not use frequency
mode, because frequency mode requires reevaluating the frequency on each
overflow.
The floating point state (XMM) is also supported, which allows efficient
profiling of FP function arguments.
Introduce specific drain function to handle variable length records.
Use a new callback to parse the new record format, and also handle the
STATUS field now being at a different offset.
Add code to set up the configuration register. Since there is only a
single register, all events either get the full super set of all events,
or only the basic record.
Originally-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: jolsa@kernel.org
Link: https://lkml.kernel.org/r/20190402194509.2832-6-kan.liang@linux.intel.com
[ Renamed GPRS => GP. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-03 03:45:02 +08:00
|
|
|
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
|
|
|
|
bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
|
|
|
|
mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
rdmsrl(hwc->config_base, ctrl_val);
|
|
|
|
ctrl_val &= ~mask;
|
|
|
|
ctrl_val |= bits;
|
2010-03-08 20:51:31 +08:00
|
|
|
wrmsrl(hwc->config_base, ctrl_val);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
2010-03-03 03:32:08 +08:00
|
|
|
static void intel_pmu_enable_event(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:32:08 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2020-06-13 16:09:47 +08:00
|
|
|
int idx = hwc->idx;
|
2013-09-12 18:53:44 +08:00
|
|
|
|
2018-03-09 10:15:40 +08:00
|
|
|
if (unlikely(event->attr.precise_ip))
|
|
|
|
intel_pmu_pebs_enable(event);
|
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
if (idx < INTEL_PMC_IDX_FIXED) {
|
|
|
|
intel_set_masks(event, idx);
|
|
|
|
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
} else if (idx < INTEL_PMC_IDX_FIXED_BTS) {
|
|
|
|
intel_set_masks(event, idx);
|
2018-03-09 10:15:40 +08:00
|
|
|
intel_pmu_enable_fixed(event);
|
2020-06-13 16:09:47 +08:00
|
|
|
} else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
|
|
|
|
if (!__this_cpu_read(cpu_hw_events.enabled))
|
|
|
|
return;
|
|
|
|
intel_pmu_enable_bts(hwc->config);
|
2020-06-13 16:09:50 +08:00
|
|
|
} else if (idx == INTEL_PMC_IDX_FIXED_VLBR)
|
|
|
|
intel_set_masks(event, idx);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
static void intel_pmu_add_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip)
|
|
|
|
intel_pmu_pebs_add(event);
|
|
|
|
if (needs_branch_stack(event))
|
|
|
|
intel_pmu_lbr_add(event);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Save and restart an expired event. Called by NMI contexts,
|
|
|
|
* so it has to be careful about preempting normal event ops:
|
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
int intel_pmu_save_and_restart(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:18:39 +08:00
|
|
|
x86_perf_event_update(event);
|
2013-09-06 11:37:38 +08:00
|
|
|
/*
|
|
|
|
* For a checkpointed counter always reset back to 0. This
|
|
|
|
* avoids a situation where the counter overflows, aborts the
|
|
|
|
* transaction and is then set back to shortly before the
|
|
|
|
* overflow, and overflows and aborts again.
|
|
|
|
*/
|
|
|
|
if (unlikely(event_is_checkpointed(event))) {
|
|
|
|
/* No race with NMIs because the counter should not be armed */
|
|
|
|
wrmsrl(event->hw.event_base, 0);
|
|
|
|
local64_set(&event->hw.prev_count, 0);
|
|
|
|
}
|
2010-03-03 03:18:39 +08:00
|
|
|
return x86_perf_event_set_period(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_reset(void)
|
|
|
|
{
|
2010-12-18 23:28:55 +08:00
|
|
|
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
|
2010-02-26 19:05:05 +08:00
|
|
|
unsigned long flags;
|
|
|
|
int idx;
|
|
|
|
|
2010-03-30 00:36:50 +08:00
|
|
|
if (!x86_pmu.num_counters)
|
2010-02-26 19:05:05 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-30 00:36:50 +08:00
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
2012-06-08 04:32:04 +08:00
|
|
|
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
|
|
|
|
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
2010-03-30 00:36:50 +08:00
|
|
|
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
|
2012-06-08 04:32:04 +08:00
|
|
|
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
|
2010-03-30 00:36:50 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (ds)
|
|
|
|
ds->bts_index = ds->bts_buffer_base;
|
|
|
|
|
2015-02-28 01:48:30 +08:00
|
|
|
/* Ack all overflows and disable fixed counters */
|
|
|
|
if (x86_pmu.version >= 2) {
|
|
|
|
intel_pmu_ack_status(intel_pmu_get_status());
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Reset LBRs and LBR freezing */
|
|
|
|
if (x86_pmu.lbr_nr) {
|
|
|
|
update_debugctlmsr(get_debugctlmsr() &
|
|
|
|
~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:06 +08:00
|
|
|
static int handle_pmi_common(struct pt_regs *regs, u64 status)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
struct perf_sample_data data;
|
2018-08-08 15:12:06 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
int bit;
|
|
|
|
int handled = 0;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
inc_irq_stat(apic_perf_irqs);
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled do to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the bahviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
/*
|
2015-05-11 03:22:45 +08:00
|
|
|
* Ignore a range of extra bits in status that do not indicate
|
|
|
|
* overflow by themselves.
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled do to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the bahviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
*/
|
2015-05-11 03:22:45 +08:00
|
|
|
status &= ~(GLOBAL_STATUS_COND_CHG |
|
|
|
|
GLOBAL_STATUS_ASIF |
|
|
|
|
GLOBAL_STATUS_LBRS_FROZEN);
|
|
|
|
if (!status)
|
2018-08-08 15:12:06 +08:00
|
|
|
return 0;
|
2016-12-22 16:29:26 +08:00
|
|
|
/*
|
|
|
|
* In case multiple PEBS events are sampled at the same time,
|
|
|
|
* it is possible to have GLOBAL_STATUS bit 62 set indicating
|
|
|
|
* PEBS buffer overflow and also seeing at most 3 PEBS counters
|
|
|
|
* having their bits set in the status register. This is a sign
|
|
|
|
* that there was at least one PEBS record pending at the time
|
|
|
|
* of the PMU interrupt. PEBS counters must only be processed
|
|
|
|
* via the drain_pebs() calls and not via the regular sample
|
|
|
|
* processing loop coming after that the function, otherwise
|
|
|
|
* phony regular samples may be generated in the sampling buffer
|
|
|
|
* not marked with the EXACT tag. Another possibility is to have
|
|
|
|
* one PEBS event and at least one non-PEBS event whic hoverflows
|
|
|
|
* while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will
|
|
|
|
* not be set, yet the overflow status bit for the PEBS counter will
|
|
|
|
* be on Skylake.
|
|
|
|
*
|
|
|
|
* To avoid this problem, we systematically ignore the PEBS-enabled
|
|
|
|
* counters from the GLOBAL_STATUS mask and we always process PEBS
|
|
|
|
* events via drain_pebs().
|
|
|
|
*/
|
2018-03-09 10:15:41 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
|
|
|
|
status &= ~cpuc->pebs_enabled;
|
|
|
|
else
|
|
|
|
status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled do to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the bahviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
/*
|
|
|
|
* PEBS overflow sets bit 62 in the global status register
|
|
|
|
*/
|
2020-07-24 01:11:05 +08:00
|
|
|
if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) {
|
2020-01-22 02:13:38 +08:00
|
|
|
u64 pebs_enabled = cpuc->pebs_enabled;
|
|
|
|
|
2010-09-03 03:07:49 +08:00
|
|
|
handled++;
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
x86_pmu.drain_pebs(regs);
|
perf/x86/pebs: Add workaround for broken OVFL status on HSW+
This patch fixes an issue with the GLOBAL_OVERFLOW_STATUS bits on
Haswell, Broadwell and Skylake processors when using PEBS.
The SDM stipulates that when the PEBS iterrupt threshold is crossed,
an interrupt is posted and the kernel is interrupted. The kernel will
find GLOBAL_OVF_SATUS bit 62 set indicating there are PEBS records to
drain. But the bits corresponding to the actual counters should NOT be
set. The kernel follows the SDM and assumes that all PEBS events are
processed in the drain_pebs() callback. The kernel then checks for
remaining overflows on any other (non-PEBS) events and processes these
in the for_each_bit_set(&status) loop.
As it turns out, under certain conditions on HSW and later processors,
on PEBS buffer interrupt, bit 62 is set but the counter bits may be
set as well. In that case, the kernel drains PEBS and generates
SAMPLES with the EXACT tag, then it processes the counter bits, and
generates normal (non-EXACT) SAMPLES.
I ran into this problem by trying to understand why on HSW sampling on
a PEBS event was sometimes returning SAMPLES without the EXACT tag.
This should not happen on user level code because HSW has the
eventing_ip which always point to the instruction that caused the
event.
The workaround in this patch simply ensures that the bits for the
counters used for PEBS events are cleared after the PEBS buffer has
been drained. With this fix 100% of the PEBS samples on my user code
report the EXACT tag.
Before:
$ perf record -e cpu/event=0xd0,umask=0x81/upp ./multichase
$ perf report -D | fgrep SAMPLES
PERF_RECORD_SAMPLE(IP, 0x2): 11775/11775: 0x406de5 period: 73469 addr: 0 exact=Y
\--- EXACT tag is missing
After:
$ perf record -e cpu/event=0xd0,umask=0x81/upp ./multichase
$ perf report -D | fgrep SAMPLES
PERF_RECORD_SAMPLE(IP, 0x4002): 11775/11775: 0x406de5 period: 73469 addr: 0 exact=Y
\--- EXACT tag is set
The problem tends to appear more often when multiple PEBS events are used.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: namhyung@kernel.org
Link: http://lkml.kernel.org/r/1457034642-21837-3-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 03:50:41 +08:00
|
|
|
status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
|
2020-01-22 02:13:38 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* PMI throttle may be triggered, which stops the PEBS event.
|
|
|
|
* Although cpuc->pebs_enabled is updated accordingly, the
|
|
|
|
* MSR_IA32_PEBS_ENABLE is not updated. Because the
|
|
|
|
* cpuc->enabled has been forced to 0 in PMI.
|
|
|
|
* Update the MSR if pebs_enabled is changed.
|
|
|
|
*/
|
|
|
|
if (pebs_enabled != cpuc->pebs_enabled)
|
|
|
|
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
|
2010-09-03 03:07:49 +08:00
|
|
|
}
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2015-01-30 18:39:52 +08:00
|
|
|
/*
|
|
|
|
* Intel PT
|
|
|
|
*/
|
2020-07-24 01:11:05 +08:00
|
|
|
if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
|
2015-01-30 18:39:52 +08:00
|
|
|
handled++;
|
2019-02-19 08:26:07 +08:00
|
|
|
if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
|
|
|
|
perf_guest_cbs->handle_intel_pt_intr))
|
|
|
|
perf_guest_cbs->handle_intel_pt_intr();
|
|
|
|
else
|
|
|
|
intel_pt_interrupt();
|
2015-01-30 18:39:52 +08:00
|
|
|
}
|
|
|
|
|
2013-09-06 11:37:38 +08:00
|
|
|
/*
|
2013-09-12 18:53:44 +08:00
|
|
|
* Checkpointed counters can lead to 'spurious' PMIs because the
|
|
|
|
* rollback caused by the PMI will have cleared the overflow status
|
|
|
|
* bit. Therefore always force probe these counters.
|
2013-09-06 11:37:38 +08:00
|
|
|
*/
|
2013-09-12 18:53:44 +08:00
|
|
|
status |= cpuc->intel_cp_status;
|
2013-09-06 11:37:38 +08:00
|
|
|
|
2010-03-06 05:41:37 +08:00
|
|
|
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
|
2010-02-26 19:05:05 +08:00
|
|
|
struct perf_event *event = cpuc->events[bit];
|
|
|
|
|
2010-09-03 03:07:49 +08:00
|
|
|
handled++;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (!test_bit(bit, cpuc->active_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!intel_pmu_save_and_restart(event))
|
|
|
|
continue;
|
|
|
|
|
2012-04-03 02:19:08 +08:00
|
|
|
perf_sample_data_init(&data, 0, event->hw.last_period);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2012-02-10 06:20:57 +08:00
|
|
|
if (has_branch_stack(event))
|
|
|
|
data.br_stack = &cpuc->lbr_stack;
|
|
|
|
|
2011-06-27 20:41:57 +08:00
|
|
|
if (perf_event_overflow(event, &data, regs))
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
x86_pmu_stop(event, 0);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:06 +08:00
|
|
|
return handled;
|
|
|
|
}
|
|
|
|
|
2018-11-21 01:08:42 +08:00
|
|
|
static bool disable_counter_freezing = true;
|
2018-08-08 15:12:07 +08:00
|
|
|
static int __init intel_perf_counter_freezing_setup(char *s)
|
|
|
|
{
|
2018-11-21 01:08:42 +08:00
|
|
|
bool res;
|
|
|
|
|
|
|
|
if (kstrtobool(s, &res))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
disable_counter_freezing = !res;
|
2018-08-08 15:12:07 +08:00
|
|
|
return 1;
|
|
|
|
}
|
2018-11-21 01:08:42 +08:00
|
|
|
__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
|
2018-08-08 15:12:07 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Simplified handler for Arch Perfmon v4:
|
|
|
|
* - We rely on counter freezing/unfreezing to enable/disable the PMU.
|
|
|
|
* This is done automatically on PMU ack.
|
|
|
|
* - Ack the PMU only after the APIC.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
int handled = 0;
|
|
|
|
bool bts = false;
|
|
|
|
u64 status;
|
|
|
|
int pmu_enabled = cpuc->enabled;
|
|
|
|
int loops = 0;
|
|
|
|
|
|
|
|
/* PMU has been disabled because of counter freezing */
|
|
|
|
cpuc->enabled = 0;
|
|
|
|
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
|
|
|
|
bts = true;
|
|
|
|
intel_bts_disable_local();
|
|
|
|
handled = intel_pmu_drain_bts_buffer();
|
|
|
|
handled += intel_bts_interrupt();
|
|
|
|
}
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (!status)
|
|
|
|
goto done;
|
|
|
|
again:
|
|
|
|
intel_pmu_lbr_read();
|
|
|
|
if (++loops > 100) {
|
|
|
|
static bool warned;
|
|
|
|
|
|
|
|
if (!warned) {
|
|
|
|
WARN(1, "perfevents: irq loop stuck!\n");
|
|
|
|
perf_event_print_debug();
|
|
|
|
warned = true;
|
|
|
|
}
|
|
|
|
intel_pmu_reset();
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
handled += handle_pmi_common(regs, status);
|
|
|
|
done:
|
|
|
|
/* Ack the PMI in the APIC */
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The counters start counting immediately while ack the status.
|
|
|
|
* Make it as close as possible to IRET. This avoids bogus
|
|
|
|
* freezing on Skylake CPUs.
|
|
|
|
*/
|
|
|
|
if (status) {
|
|
|
|
intel_pmu_ack_status(status);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* CPU may issues two PMIs very close to each other.
|
|
|
|
* When the PMI handler services the first one, the
|
|
|
|
* GLOBAL_STATUS is already updated to reflect both.
|
|
|
|
* When it IRETs, the second PMI is immediately
|
|
|
|
* handled and it sees clear status. At the meantime,
|
|
|
|
* there may be a third PMI, because the freezing bit
|
|
|
|
* isn't set since the ack in first PMI handlers.
|
|
|
|
* Double check if there is more work to be done.
|
|
|
|
*/
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (status)
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bts)
|
|
|
|
intel_bts_enable_local();
|
|
|
|
cpuc->enabled = pmu_enabled;
|
|
|
|
return handled;
|
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:06 +08:00
|
|
|
/*
|
|
|
|
* This handler is triggered by the local APIC, so the APIC IRQ handling
|
|
|
|
* rules apply:
|
|
|
|
*/
|
|
|
|
static int intel_pmu_handle_irq(struct pt_regs *regs)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc;
|
|
|
|
int loops;
|
|
|
|
u64 status;
|
|
|
|
int handled;
|
|
|
|
int pmu_enabled;
|
|
|
|
|
|
|
|
cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save the PMU state.
|
|
|
|
* It needs to be restored when leaving the handler.
|
|
|
|
*/
|
|
|
|
pmu_enabled = cpuc->enabled;
|
|
|
|
/*
|
|
|
|
* No known reason to not always do late ACK,
|
|
|
|
* but just in case do it opt-in.
|
|
|
|
*/
|
|
|
|
if (!x86_pmu.late_ack)
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
|
|
|
intel_bts_disable_local();
|
|
|
|
cpuc->enabled = 0;
|
|
|
|
__intel_pmu_disable_all();
|
|
|
|
handled = intel_pmu_drain_bts_buffer();
|
|
|
|
handled += intel_bts_interrupt();
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (!status)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
loops = 0;
|
|
|
|
again:
|
|
|
|
intel_pmu_lbr_read();
|
|
|
|
intel_pmu_ack_status(status);
|
|
|
|
if (++loops > 100) {
|
|
|
|
static bool warned;
|
|
|
|
|
|
|
|
if (!warned) {
|
|
|
|
WARN(1, "perfevents: irq loop stuck!\n");
|
|
|
|
perf_event_print_debug();
|
|
|
|
warned = true;
|
|
|
|
}
|
|
|
|
intel_pmu_reset();
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
handled += handle_pmi_common(regs, status);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Repeat if there is more work to be done:
|
|
|
|
*/
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (status)
|
|
|
|
goto again;
|
|
|
|
|
2010-03-08 20:51:01 +08:00
|
|
|
done:
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
/* Only restore PMU state when it's active. See x86_pmu_disable(). */
|
2018-02-20 18:11:50 +08:00
|
|
|
cpuc->enabled = pmu_enabled;
|
|
|
|
if (pmu_enabled)
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
__intel_pmu_enable_all(0, true);
|
2016-09-15 16:22:33 +08:00
|
|
|
intel_bts_enable_local();
|
perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi
This patch tries to fix a PEBS warning found in my stress test. The
following perf command can easily trigger the pebs warning or spurious
NMI error on Skylake/Broadwell/Haswell platforms:
sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a
Also the NMI watchdog must be enabled.
For this case, the events number is larger than counter number. So
perf has to do multiplexing.
In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out
old events, rotate_ctx, schedule in new events and finally
perf_pmu_enable().
If the old events include precise event, the MSR_IA32_PEBS_ENABLE
should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE
should keep 0 until the perf_pmu_enable() is called and the new event is
precise event.
However, there is a corner case which could restore PEBS_ENABLE to
stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL
will be set to 0 to stop overflow and followed PMI. But there may be
pending PMI from an earlier overflow, which cannot be stopped. So even
GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At
the end of the PMI handler, __intel_pmu_enable_all() will be called,
which will restore the stale values if old events haven't scheduled
out.
Once the stale pebs value is set, it's impossible to be corrected if
the new events are non-precise. Because the pebs_enabled will be set
to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE
setting. As a result, the following NMI with stale PEBS_ENABLE
trigger pebs warning.
The pending PMI after enabled=0 will become harmless if the NMI handler
does not change the state. This patch checks cpuc->enabled in pmi and
only restore the state when PMU is active.
Here is the dump:
Call Trace:
<NMI> [<ffffffff813c3a2e>] dump_stack+0x63/0x85
[<ffffffff810a46f2>] warn_slowpath_common+0x82/0xc0
[<ffffffff810a483a>] warn_slowpath_null+0x1a/0x20
[<ffffffff8100fe2e>] intel_pmu_drain_pebs_nhm+0x2be/0x320
[<ffffffff8100caa9>] intel_pmu_handle_irq+0x279/0x460
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff811f290d>] ? vunmap_page_range+0x20d/0x330
[<ffffffff811f2f11>] ? unmap_kernel_range_noflush+0x11/0x20
[<ffffffff8148379f>] ? ghes_copy_tofrom_phys+0x10f/0x2a0
[<ffffffff814839c8>] ? ghes_read_estatus+0x98/0x170
[<ffffffff81005a7d>] perf_event_nmi_handler+0x2d/0x50
[<ffffffff810310b9>] nmi_handle+0x69/0x120
[<ffffffff810316f6>] default_do_nmi+0xe6/0x100
[<ffffffff810317f2>] do_nmi+0xe2/0x130
[<ffffffff817aea71>] end_repeat_nmi+0x1a/0x1e
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
[<ffffffff810639b6>] ? native_write_msr_safe+0x6/0x40
<<EOE>> <IRQ> [<ffffffff81006df8>] ? x86_perf_event_set_period+0xd8/0x180
[<ffffffff81006eec>] x86_pmu_start+0x4c/0x100
[<ffffffff8100722d>] x86_pmu_enable+0x28d/0x300
[<ffffffff811994d7>] perf_pmu_enable.part.81+0x7/0x10
[<ffffffff8119cb70>] perf_mux_hrtimer_handler+0x200/0x280
[<ffffffff8119c970>] ? __perf_install_in_context+0xc0/0xc0
[<ffffffff8110f92d>] __hrtimer_run_queues+0xfd/0x280
[<ffffffff811100d8>] hrtimer_interrupt+0xa8/0x190
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81051bd8>] local_apic_timer_interrupt+0x38/0x60
[<ffffffff817af01d>] smp_apic_timer_interrupt+0x3d/0x50
[<ffffffff817ad15c>] apic_timer_interrupt+0x8c/0xa0
<EOI> [<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff81123de5>] ? smp_call_function_single+0xd5/0x130
[<ffffffff81123ddb>] ? smp_call_function_single+0xcb/0x130
[<ffffffff81199080>] ? __perf_read_group_add.part.61+0x1a0/0x1a0
[<ffffffff8119765a>] event_function_call+0x10a/0x120
[<ffffffff8119c660>] ? ctx_resched+0x90/0x90
[<ffffffff811971e0>] ? cpu_clock_event_read+0x30/0x30
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff8119772b>] _perf_event_enable+0x5b/0x70
[<ffffffff81197388>] perf_event_for_each_child+0x38/0xa0
[<ffffffff811976d0>] ? _perf_event_disable+0x60/0x60
[<ffffffff811a0ffd>] perf_ioctl+0x12d/0x3c0
[<ffffffff8134d855>] ? selinux_file_ioctl+0x95/0x1e0
[<ffffffff8124a3a1>] do_vfs_ioctl+0xa1/0x5a0
[<ffffffff81036d29>] ? sched_clock+0x9/0x10
[<ffffffff8124a919>] SyS_ioctl+0x79/0x90
[<ffffffff817ac4b2>] entry_SYSCALL_64_fastpath+0x1a/0xa4
---[ end trace aef202839fe9a71d ]---
Uhhuh. NMI received for unknown reason 2d on CPU 2.
Do you have a strange power saving mode enabled?
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com
[ Fixed various typos and other small details. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-03-04 07:07:28 +08:00
|
|
|
|
2013-06-18 08:36:50 +08:00
|
|
|
/*
|
|
|
|
* Only unmask the NMI after the overflow counters
|
|
|
|
* have been reset. This avoids spurious NMIs on
|
|
|
|
* Haswell CPUs.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.late_ack)
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
2010-09-03 03:07:49 +08:00
|
|
|
return handled;
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
intel_bts_constraints(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2018-11-21 18:16:11 +08:00
|
|
|
if (unlikely(intel_pmu_has_bts(event)))
|
2010-02-26 19:05:05 +08:00
|
|
|
return &bts_constraint;
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-06-13 16:09:49 +08:00
|
|
|
/*
|
|
|
|
* Note: matches a fake event, like Fixed2.
|
|
|
|
*/
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_vlbr_constraints(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c = &vlbr_constraint;
|
|
|
|
|
|
|
|
if (unlikely(constraint_match(c, event->hw.config)))
|
|
|
|
return c;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2015-06-25 02:23:35 +08:00
|
|
|
static int intel_alt_er(int idx, u64 config)
|
2011-05-23 17:08:15 +08:00
|
|
|
{
|
2016-01-28 06:24:29 +08:00
|
|
|
int alt_idx = idx;
|
|
|
|
|
2014-11-18 03:06:53 +08:00
|
|
|
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
|
2012-06-05 21:30:31 +08:00
|
|
|
return idx;
|
2011-05-23 17:08:15 +08:00
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
if (idx == EXTRA_REG_RSP_0)
|
2015-06-25 02:23:35 +08:00
|
|
|
alt_idx = EXTRA_REG_RSP_1;
|
2012-06-05 21:30:31 +08:00
|
|
|
|
|
|
|
if (idx == EXTRA_REG_RSP_1)
|
2015-06-25 02:23:35 +08:00
|
|
|
alt_idx = EXTRA_REG_RSP_0;
|
2012-06-05 21:30:31 +08:00
|
|
|
|
2015-06-25 02:23:35 +08:00
|
|
|
if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
|
|
|
|
return idx;
|
|
|
|
|
|
|
|
return alt_idx;
|
2012-06-05 21:30:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_fixup_er(struct perf_event *event, int idx)
|
|
|
|
{
|
|
|
|
event->hw.extra_reg.idx = idx;
|
|
|
|
|
|
|
|
if (idx == EXTRA_REG_RSP_0) {
|
2011-05-23 17:08:15 +08:00
|
|
|
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
|
2013-07-18 17:02:23 +08:00
|
|
|
event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
|
2011-05-23 17:08:15 +08:00
|
|
|
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
|
2012-06-05 21:30:31 +08:00
|
|
|
} else if (idx == EXTRA_REG_RSP_1) {
|
|
|
|
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
|
2013-07-18 17:02:23 +08:00
|
|
|
event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
|
2012-06-05 21:30:31 +08:00
|
|
|
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
|
2011-05-23 17:08:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
/*
|
|
|
|
* manage allocation of shared extra msr for certain events
|
|
|
|
*
|
|
|
|
* sharing can be:
|
|
|
|
* per-cpu: to be shared between the various events on a single PMU
|
|
|
|
* per-core: per-cpu + shared by HT threads
|
|
|
|
*/
|
2011-03-03 10:34:47 +08:00
|
|
|
static struct event_constraint *
|
2011-06-06 22:57:03 +08:00
|
|
|
__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
|
2012-02-10 06:20:53 +08:00
|
|
|
struct perf_event *event,
|
|
|
|
struct hw_perf_event_extra *reg)
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2011-06-06 22:57:03 +08:00
|
|
|
struct event_constraint *c = &emptyconstraint;
|
2011-03-03 10:34:47 +08:00
|
|
|
struct er_account *era;
|
2011-06-06 22:57:08 +08:00
|
|
|
unsigned long flags;
|
2012-06-05 21:30:31 +08:00
|
|
|
int idx = reg->idx;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
/*
|
|
|
|
* reg->alloc can be set due to existing state, so for fake cpuc we
|
|
|
|
* need to ignore this, otherwise we might fail to allocate proper fake
|
|
|
|
* state for this extra reg constraint. Also see the comment below.
|
|
|
|
*/
|
|
|
|
if (reg->alloc && !cpuc->is_fake)
|
2012-02-10 06:20:53 +08:00
|
|
|
return NULL; /* call x86_get_event_constraint() */
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-05-23 17:08:15 +08:00
|
|
|
again:
|
2012-06-05 21:30:31 +08:00
|
|
|
era = &cpuc->shared_regs->regs[idx];
|
2011-06-06 22:57:08 +08:00
|
|
|
/*
|
|
|
|
* we use spin_lock_irqsave() to avoid lockdep issues when
|
|
|
|
* passing a fake cpuc
|
|
|
|
*/
|
|
|
|
raw_spin_lock_irqsave(&era->lock, flags);
|
2011-06-06 22:57:03 +08:00
|
|
|
|
|
|
|
if (!atomic_read(&era->ref) || era->config == reg->config) {
|
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
/*
|
|
|
|
* If its a fake cpuc -- as per validate_{group,event}() we
|
|
|
|
* shouldn't touch event state and we can avoid doing so
|
|
|
|
* since both will only call get_event_constraints() once
|
|
|
|
* on each event, this avoids the need for reg->alloc.
|
|
|
|
*
|
|
|
|
* Not doing the ER fixup will only result in era->reg being
|
|
|
|
* wrong, but since we won't actually try and program hardware
|
|
|
|
* this isn't a problem either.
|
|
|
|
*/
|
|
|
|
if (!cpuc->is_fake) {
|
|
|
|
if (idx != reg->idx)
|
|
|
|
intel_fixup_er(event, idx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* x86_schedule_events() can call get_event_constraints()
|
|
|
|
* multiple times on events in the case of incremental
|
|
|
|
* scheduling(). reg->alloc ensures we only do the ER
|
|
|
|
* allocation once.
|
|
|
|
*/
|
|
|
|
reg->alloc = 1;
|
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
/* lock in msr value */
|
|
|
|
era->config = reg->config;
|
|
|
|
era->reg = reg->reg;
|
|
|
|
|
|
|
|
/* one more user */
|
|
|
|
atomic_inc(&era->ref);
|
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
/*
|
2012-02-10 06:20:53 +08:00
|
|
|
* need to call x86_get_event_constraint()
|
|
|
|
* to check if associated event has constraints
|
2011-03-03 10:34:47 +08:00
|
|
|
*/
|
2012-02-10 06:20:53 +08:00
|
|
|
c = NULL;
|
2012-06-05 21:30:31 +08:00
|
|
|
} else {
|
2015-06-25 02:23:35 +08:00
|
|
|
idx = intel_alt_er(idx, reg->config);
|
2012-06-05 21:30:31 +08:00
|
|
|
if (idx != reg->idx) {
|
|
|
|
raw_spin_unlock_irqrestore(&era->lock, flags);
|
|
|
|
goto again;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
2011-06-06 22:57:08 +08:00
|
|
|
raw_spin_unlock_irqrestore(&era->lock, flags);
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct hw_perf_event_extra *reg)
|
|
|
|
{
|
|
|
|
struct er_account *era;
|
|
|
|
|
|
|
|
/*
|
2012-06-05 21:30:31 +08:00
|
|
|
* Only put constraint if extra reg was actually allocated. Also takes
|
|
|
|
* care of event which do not use an extra shared reg.
|
|
|
|
*
|
|
|
|
* Also, if this is a fake cpuc we shouldn't touch any event state
|
|
|
|
* (reg->alloc) and we don't care about leaving inconsistent cpuc state
|
|
|
|
* either since it'll be thrown out.
|
2011-06-06 22:57:03 +08:00
|
|
|
*/
|
2012-06-05 21:30:31 +08:00
|
|
|
if (!reg->alloc || cpuc->is_fake)
|
2011-06-06 22:57:03 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
era = &cpuc->shared_regs->regs[reg->idx];
|
|
|
|
|
|
|
|
/* one fewer user */
|
|
|
|
atomic_dec(&era->ref);
|
|
|
|
|
|
|
|
/* allocate again next time */
|
|
|
|
reg->alloc = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2012-02-10 06:20:53 +08:00
|
|
|
struct event_constraint *c = NULL, *d;
|
|
|
|
struct hw_perf_event_extra *xreg, *breg;
|
|
|
|
|
|
|
|
xreg = &event->hw.extra_reg;
|
|
|
|
if (xreg->idx != EXTRA_REG_NONE) {
|
|
|
|
c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
|
|
|
|
if (c == &emptyconstraint)
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
breg = &event->hw.branch_reg;
|
|
|
|
if (breg->idx != EXTRA_REG_NONE) {
|
|
|
|
d = __intel_shared_reg_get_constraints(cpuc, event, breg);
|
|
|
|
if (d == &emptyconstraint) {
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, xreg);
|
|
|
|
c = d;
|
|
|
|
}
|
|
|
|
}
|
2011-06-06 22:57:03 +08:00
|
|
|
return c;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
struct event_constraint *
|
2014-11-18 03:06:56 +08:00
|
|
|
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
2011-08-31 07:41:05 +08:00
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
if (x86_pmu.event_constraints) {
|
|
|
|
for_each_event_constraint(c, x86_pmu.event_constraints) {
|
2019-04-03 03:45:04 +08:00
|
|
|
if (constraint_match(c, event->hw.config)) {
|
2013-01-24 23:10:27 +08:00
|
|
|
event->hw.flags |= c->flags;
|
2011-08-31 07:41:05 +08:00
|
|
|
return c;
|
2013-01-24 23:10:27 +08:00
|
|
|
}
|
2011-08-31 07:41:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return &unconstrained;
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static struct event_constraint *
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
2014-11-18 03:06:56 +08:00
|
|
|
struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
2020-06-13 16:09:49 +08:00
|
|
|
c = intel_vlbr_constraints(event);
|
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
c = intel_bts_constraints(event);
|
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
2015-03-27 22:38:25 +08:00
|
|
|
c = intel_shared_regs_constraints(cpuc, event);
|
2010-02-26 19:05:05 +08:00
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
2015-03-27 22:38:25 +08:00
|
|
|
c = intel_pebs_constraints(event);
|
2011-03-03 10:34:47 +08:00
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
2014-11-18 03:06:56 +08:00
|
|
|
return x86_get_event_constraints(cpuc, idx, event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static void
|
|
|
|
intel_start_scheduling(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
2014-11-18 03:07:04 +08:00
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
xl->sched_started = true;
|
|
|
|
/*
|
|
|
|
* lock shared state until we are done scheduling
|
|
|
|
* in stop_event_scheduling()
|
|
|
|
* makes scheduling appear as a transaction
|
|
|
|
*/
|
|
|
|
raw_spin_lock(&excl_cntrs->lock);
|
|
|
|
}
|
|
|
|
|
2015-05-21 16:57:32 +08:00
|
|
|
static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
|
|
|
struct event_constraint *c = cpuc->event_constraint[idx];
|
|
|
|
struct intel_excl_states *xl;
|
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
|
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
lockdep_assert_held(&excl_cntrs->lock);
|
|
|
|
|
2015-05-21 16:57:36 +08:00
|
|
|
if (c->flags & PERF_X86_EVENT_EXCL)
|
2015-05-21 16:57:39 +08:00
|
|
|
xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
|
2015-05-21 16:57:36 +08:00
|
|
|
else
|
2015-05-21 16:57:39 +08:00
|
|
|
xl->state[cntr] = INTEL_EXCL_SHARED;
|
2015-05-21 16:57:32 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static void
|
|
|
|
intel_stop_scheduling(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
xl->sched_started = false;
|
|
|
|
/*
|
|
|
|
* release shared state lock (acquired in intel_start_scheduling())
|
|
|
|
*/
|
|
|
|
raw_spin_unlock(&excl_cntrs->lock);
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:16 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx)
|
|
|
|
{
|
|
|
|
WARN_ON_ONCE(!cpuc->constraint_list);
|
|
|
|
|
|
|
|
if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
|
|
|
|
struct event_constraint *cx;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* grab pre-allocated constraint entry
|
|
|
|
*/
|
|
|
|
cx = &cpuc->constraint_list[idx];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialize dynamic constraint
|
|
|
|
* with static constraint
|
|
|
|
*/
|
|
|
|
*cx = *c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mark constraint as dynamic
|
|
|
|
*/
|
|
|
|
cx->flags |= PERF_X86_EVENT_DYNAMIC;
|
|
|
|
c = cx;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
|
|
|
|
int idx, struct event_constraint *c)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xlo;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
2019-03-14 20:01:14 +08:00
|
|
|
int is_excl, i, w;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* validating a group does not require
|
|
|
|
* enforcing cross-thread exclusion
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
|
|
|
return c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* because we modify the constraint, we need
|
|
|
|
* to make a copy. Static constraints come
|
|
|
|
* from static const tables.
|
|
|
|
*
|
|
|
|
* only needed when constraint has not yet
|
|
|
|
* been cloned (marked dynamic)
|
|
|
|
*/
|
2019-03-06 05:23:16 +08:00
|
|
|
c = dyn_constraint(cpuc, c, idx);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* From here on, the constraint is dynamic.
|
|
|
|
* Either it was just allocated above, or it
|
|
|
|
* was allocated during a earlier invocation
|
|
|
|
* of this function
|
|
|
|
*/
|
|
|
|
|
2015-05-21 16:57:21 +08:00
|
|
|
/*
|
|
|
|
* state of sibling HT
|
|
|
|
*/
|
|
|
|
xlo = &excl_cntrs->states[tid ^ 1];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* event requires exclusive counter access
|
|
|
|
* across HT threads
|
|
|
|
*/
|
|
|
|
is_excl = c->flags & PERF_X86_EVENT_EXCL;
|
|
|
|
if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
|
|
|
|
if (!cpuc->n_excl++)
|
|
|
|
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* Modify static constraint with current dynamic
|
|
|
|
* state of thread
|
|
|
|
*
|
|
|
|
* EXCLUSIVE: sibling counter measuring exclusive event
|
|
|
|
* SHARED : sibling counter measuring non-exclusive event
|
|
|
|
* UNUSED : sibling counter unused
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
w = c->weight;
|
2015-05-21 16:57:24 +08:00
|
|
|
for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* exclusive event in sibling counter
|
|
|
|
* our corresponding counter cannot be used
|
|
|
|
* regardless of our event
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) {
|
2015-05-21 16:57:24 +08:00
|
|
|
__clear_bit(i, c->idxmsk);
|
2019-03-14 20:01:14 +08:00
|
|
|
w--;
|
|
|
|
continue;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* if measuring an exclusive event, sibling
|
|
|
|
* measuring non-exclusive, then counter cannot
|
|
|
|
* be used
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) {
|
2015-05-21 16:57:24 +08:00
|
|
|
__clear_bit(i, c->idxmsk);
|
2019-03-14 20:01:14 +08:00
|
|
|
w--;
|
|
|
|
continue;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* if we return an empty mask, then switch
|
|
|
|
* back to static empty constraint to avoid
|
|
|
|
* the cost of freeing later on
|
|
|
|
*/
|
2019-03-14 20:01:14 +08:00
|
|
|
if (!w)
|
2015-05-21 16:57:24 +08:00
|
|
|
c = &emptyconstraint;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2019-03-14 20:01:14 +08:00
|
|
|
c->weight = w;
|
|
|
|
|
2015-05-21 16:57:24 +08:00
|
|
|
return c;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2019-03-14 16:57:57 +08:00
|
|
|
struct event_constraint *c1, *c2;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2019-03-14 16:57:57 +08:00
|
|
|
c1 = cpuc->event_constraint[idx];
|
2015-09-10 17:58:27 +08:00
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* first time only
|
|
|
|
* - static constraint: no change across incremental scheduling calls
|
|
|
|
* - dynamic constraint: handled by intel_get_excl_constraints()
|
|
|
|
*/
|
2014-11-18 03:07:01 +08:00
|
|
|
c2 = __intel_get_event_constraints(cpuc, idx, event);
|
2019-03-14 20:17:51 +08:00
|
|
|
if (c1) {
|
|
|
|
WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC));
|
2014-11-18 03:07:01 +08:00
|
|
|
bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
|
|
|
|
c1->weight = c2->weight;
|
|
|
|
c2 = c1;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
if (cpuc->excl_cntrs)
|
2014-11-18 03:07:01 +08:00
|
|
|
return intel_get_excl_constraints(cpuc, event, idx, c2);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2014-11-18 03:07:01 +08:00
|
|
|
return c2;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
|
|
|
int tid = cpuc->excl_thread_id;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
|
|
|
if (cpuc->is_fake)
|
|
|
|
return;
|
|
|
|
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
2015-05-21 16:57:17 +08:00
|
|
|
if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
|
|
|
|
hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
|
|
|
|
if (!--cpuc->n_excl)
|
|
|
|
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
2015-05-22 17:36:13 +08:00
|
|
|
* If event was actually assigned, then mark the counter state as
|
|
|
|
* unused now.
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
*/
|
2015-05-22 17:36:13 +08:00
|
|
|
if (hwc->idx >= 0) {
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* put_constraint may be called from x86_schedule_events()
|
|
|
|
* which already has the lock held so here make locking
|
|
|
|
* conditional.
|
|
|
|
*/
|
|
|
|
if (!xl->sched_started)
|
|
|
|
raw_spin_lock(&excl_cntrs->lock);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-21 16:57:21 +08:00
|
|
|
xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-22 17:36:13 +08:00
|
|
|
if (!xl->sched_started)
|
|
|
|
raw_spin_unlock(&excl_cntrs->lock);
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
static void
|
|
|
|
intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
|
2011-03-03 10:34:47 +08:00
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2011-06-06 22:57:03 +08:00
|
|
|
struct hw_perf_event_extra *reg;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
reg = &event->hw.extra_reg;
|
|
|
|
if (reg->idx != EXTRA_REG_NONE)
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, reg);
|
2012-02-10 06:20:53 +08:00
|
|
|
|
|
|
|
reg = &event->hw.branch_reg;
|
|
|
|
if (reg->idx != EXTRA_REG_NONE)
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, reg);
|
2011-06-06 22:57:03 +08:00
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
intel_put_shared_regs_event_constraints(cpuc, event);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* is PMU has exclusive counter restrictions, then
|
|
|
|
* all events are subject to and must call the
|
|
|
|
* put_excl_constraints() routine
|
|
|
|
*/
|
2015-05-21 16:57:13 +08:00
|
|
|
if (cpuc->excl_cntrs)
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
intel_put_excl_constraints(cpuc, event);
|
|
|
|
}
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
static void intel_pebs_aliases_core2(struct perf_event *event)
|
2010-03-30 23:00:06 +08:00
|
|
|
{
|
2012-06-05 16:26:43 +08:00
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
2010-12-15 04:26:40 +08:00
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use INST_RETIRED.ANY_P
|
|
|
|
* (0x00c0), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* INST_RETIRED.ANY_P counts the number of cycles that retires
|
|
|
|
* CNTMASK instructions. By setting CNTMASK to a value (16)
|
|
|
|
* larger than the maximum number of instructions that can be
|
|
|
|
* retired per cycle (4) and then inverting the condition, we
|
|
|
|
* count all cycles that retire 16 or less instructions, which
|
|
|
|
* is every cycle.
|
|
|
|
*
|
|
|
|
* Thereby we gain a PEBS capable cycle counter.
|
|
|
|
*/
|
2012-03-12 19:44:35 +08:00
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_snb(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use UOPS_RETIRED.ALL
|
|
|
|
* (0x01c2), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* UOPS_RETIRED.ALL counts the number of cycles that retires
|
|
|
|
* CNTMASK micro-ops. By setting CNTMASK to a value (16)
|
|
|
|
* larger than the maximum number of micro-ops that can be
|
|
|
|
* retired per cycle (4) and then inverting the condition, we
|
|
|
|
* count all cycles that retire 16 or less micro-ops, which
|
|
|
|
* is every cycle.
|
|
|
|
*
|
|
|
|
* Thereby we gain a PEBS capable cycle counter.
|
|
|
|
*/
|
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
|
2010-12-15 04:26:40 +08:00
|
|
|
|
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
2012-06-05 16:26:43 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
static void intel_pebs_aliases_precdist(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use INST_RETIRED.PREC_DIST
|
|
|
|
* (0x01c0), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* The PREC_DIST event has special support to minimize sample
|
|
|
|
* shadowing effects. One drawback is that it can be
|
|
|
|
* only programmed on counter 1, but that seems like an
|
|
|
|
* acceptable trade off.
|
|
|
|
*/
|
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
|
|
|
|
|
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_ivb(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip < 3)
|
|
|
|
return intel_pebs_aliases_snb(event);
|
|
|
|
return intel_pebs_aliases_precdist(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_skl(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip < 3)
|
|
|
|
return intel_pebs_aliases_core2(event);
|
|
|
|
return intel_pebs_aliases_precdist(event);
|
|
|
|
}
|
|
|
|
|
2018-03-12 22:45:37 +08:00
|
|
|
static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
|
2015-05-28 12:13:14 +08:00
|
|
|
{
|
2018-03-12 22:45:37 +08:00
|
|
|
unsigned long flags = x86_pmu.large_pebs_flags;
|
2015-05-28 12:13:14 +08:00
|
|
|
|
|
|
|
if (event->attr.use_clockid)
|
|
|
|
flags &= ~PERF_SAMPLE_TIME;
|
2017-09-01 05:46:30 +08:00
|
|
|
if (!event->attr.exclude_kernel)
|
|
|
|
flags &= ~PERF_SAMPLE_REGS_USER;
|
2019-04-03 03:44:58 +08:00
|
|
|
if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
|
2017-09-01 05:46:30 +08:00
|
|
|
flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
|
2015-05-28 12:13:14 +08:00
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
2018-11-21 18:16:10 +08:00
|
|
|
static int intel_pmu_bts_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct perf_event_attr *attr = &event->attr;
|
|
|
|
|
2018-11-21 18:16:11 +08:00
|
|
|
if (unlikely(intel_pmu_has_bts(event))) {
|
2018-11-21 18:16:10 +08:00
|
|
|
/* BTS is not supported by this architecture. */
|
|
|
|
if (!x86_pmu.bts_active)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
/* BTS is currently only allowed for user-mode. */
|
|
|
|
if (!attr->exclude_kernel)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2018-11-21 18:16:12 +08:00
|
|
|
/* BTS is not allowed for precise events. */
|
|
|
|
if (attr->precise_ip)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2018-11-21 18:16:10 +08:00
|
|
|
/* disallow bts if conflicting events are present */
|
|
|
|
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
event->destroy = hw_perf_lbr_event_destroy;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int core_pmu_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = x86_pmu_hw_config(event);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return intel_pmu_bts_config(event);
|
|
|
|
}
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
static int intel_pmu_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = x86_pmu_hw_config(event);
|
|
|
|
|
2018-11-21 18:16:10 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = intel_pmu_bts_config(event);
|
2012-06-05 16:26:43 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2015-05-07 03:33:47 +08:00
|
|
|
if (event->attr.precise_ip) {
|
2019-05-14 08:34:00 +08:00
|
|
|
if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
|
2015-05-07 03:33:47 +08:00
|
|
|
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
|
2015-05-28 12:13:14 +08:00
|
|
|
if (!(event->attr.sample_type &
|
2018-03-12 22:45:37 +08:00
|
|
|
~intel_pmu_large_pebs_flags(event)))
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
|
perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)
PEBS always had the capability to log samples to its buffers without
an interrupt. Traditionally perf has not used this but always set the
PEBS threshold to one.
For frequently occurring events (like cycles or branches or load/store)
this in term requires using a relatively high sampling period to avoid
overloading the system, by only processing PMIs. This in term increases
sampling error.
For the common cases we still need to use the PMI because the PEBS
hardware has various limitations. The biggest one is that it can not
supply a callgraph. It also requires setting a fixed period, as the
hardware does not support adaptive period. Another issue is that it
cannot supply a time stamp and some other options. To supply a TID it
requires flushing on context switch. It can however supply the IP, the
load/store address, TSX information, registers, and some other things.
So we can make PEBS work for some specific cases, basically as long as
you can do without a callgraph and can set the period you can use this
new PEBS mode.
The main benefit is the ability to support much lower sampling period
(down to -c 1000) without extensive overhead.
One use cases is for example to increase the resolution of the c2c tool.
Another is double checking when you suspect the standard sampling has
too much sampling error.
Some numbers on the overhead, using cycle soak, comparing the elapsed
time from "kernbench -M -H" between plain (threshold set to one) and
multi (large threshold).
The test command for plain:
"perf record --time -e cycles:p -c $period -- kernbench -M -H"
The test command for multi:
"perf record --no-time -e cycles:p -c $period -- kernbench -M -H"
( The only difference of test command between multi and plain is time
stamp options. Since time stamp is not supported by large PEBS
threshold, it can be used as a flag to indicate if large threshold is
enabled during the test. )
period plain(Sec) multi(Sec) Delta
10003 32.7 16.5 16.2
20003 30.2 16.2 14.0
40003 18.6 14.1 4.5
80003 16.8 14.6 2.2
100003 16.9 14.1 2.8
800003 15.4 15.7 -0.3
1000003 15.3 15.2 0.2
2000003 15.3 15.1 0.1
With periods below 100003, plain (threshold one) cause much more
overhead. With 10003 sampling period, the Elapsed Time for multi is
even 2X faster than plain.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1430940834-8964-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-07 03:33:50 +08:00
|
|
|
}
|
2015-05-07 03:33:47 +08:00
|
|
|
if (x86_pmu.pebs_aliases)
|
|
|
|
x86_pmu.pebs_aliases(event);
|
2018-05-10 21:48:41 +08:00
|
|
|
|
|
|
|
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
|
|
|
|
event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
|
2015-05-07 03:33:47 +08:00
|
|
|
}
|
2010-12-15 04:26:40 +08:00
|
|
|
|
2014-11-05 10:56:06 +08:00
|
|
|
if (needs_branch_stack(event)) {
|
2012-02-10 06:20:57 +08:00
|
|
|
ret = intel_pmu_setup_lbr_filter(event);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2015-01-14 20:18:20 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* BTS is set up earlier in this path, so don't account twice
|
|
|
|
*/
|
2018-11-21 18:16:11 +08:00
|
|
|
if (!unlikely(intel_pmu_has_bts(event))) {
|
2015-01-14 20:18:20 +08:00
|
|
|
/* disallow lbr if conflicting events are present */
|
|
|
|
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
event->destroy = hw_perf_lbr_event_destroy;
|
|
|
|
}
|
2012-02-10 06:20:57 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
if (event->attr.aux_output) {
|
|
|
|
if (!event->attr.precise_ip)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
|
|
|
|
}
|
|
|
|
|
2010-03-30 23:00:06 +08:00
|
|
|
if (event->attr.type != PERF_TYPE_RAW)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (x86_pmu.version < 3)
|
|
|
|
return -EINVAL;
|
|
|
|
|
perf_event: Add support for LSM and SELinux checks
In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl. This has a number of
limitations:
1. The sysctl is only a single value. Many types of accesses are controlled
based on the single value thus making the control very limited and
coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
all processes get access to perf_event_open(2) opening the door to
security issues.
This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.
5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
syscall itself. The hook is called from all the places that the
perf_event_paranoid sysctl is checked to keep it consistent with the
systctl. The hook gets passed a 'type' argument which controls CPU,
kernel and tracepoint accesses (in this context, CPU, kernel and
tracepoint have the same semantics as the perf_event_paranoid sysctl).
Additionally, I added an 'open' type which is similar to
perf_event_paranoid sysctl == 3 patch carried in Android and several other
distros but was rejected in mainline [1] in 2016.
2. perf_event_alloc: This allocates a new security object for the event
which stores the current SID within the event. It will be useful when
the perf event's FD is passed through IPC to another process which may
try to read the FD. Appropriate security checks will limit access.
3. perf_event_free: Called when the event is closed.
4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
5. perf_event_write: Called from the ioctl(2) syscalls for the event.
[1] https://lwn.net/Articles/696240/
Since Peter had suggest LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.
To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Co-developed-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: James Morris <jmorris@namei.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: rostedt@goodmis.org
Cc: Yonghong Song <yhs@fb.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: jeffv@google.com
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: primiano@google.com
Cc: Song Liu <songliubraving@fb.com>
Cc: rsavitski@google.com
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Matthew Garrett <matthewgarrett@google.com>
Link: https://lkml.kernel.org/r/20191014170308.70668-1-joel@joelfernandes.org
2019-10-15 01:03:08 +08:00
|
|
|
ret = perf_allow_cpu(&event->attr);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2010-03-30 23:00:06 +08:00
|
|
|
|
|
|
|
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-11-05 07:00:01 +08:00
|
|
|
#ifdef CONFIG_RETPOLINE
|
|
|
|
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
|
|
|
|
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
|
|
|
|
#endif
|
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
|
|
|
|
{
|
2019-11-05 07:00:01 +08:00
|
|
|
#ifdef CONFIG_RETPOLINE
|
|
|
|
if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
|
|
|
|
return intel_guest_get_msrs(nr);
|
|
|
|
else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
|
|
|
|
return core_guest_get_msrs(nr);
|
|
|
|
#endif
|
2011-10-05 20:01:21 +08:00
|
|
|
if (x86_pmu.guest_get_msrs)
|
|
|
|
return x86_pmu.guest_get_msrs(nr);
|
|
|
|
*nr = 0;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
|
|
|
|
|
|
|
|
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
|
|
|
|
|
|
|
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
|
|
|
|
arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
|
2019-02-05 06:23:30 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
|
|
|
|
arr[0].guest &= ~cpuc->pebs_enabled;
|
|
|
|
else
|
|
|
|
arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
|
|
|
|
*nr = 1;
|
|
|
|
|
|
|
|
if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
|
|
|
|
/*
|
|
|
|
* If PMU counter has PEBS enabled it is not enough to
|
|
|
|
* disable counter on a guest entry since PEBS memory
|
|
|
|
* write can overshoot guest entry and corrupt guest
|
|
|
|
* memory. Disabling PEBS solves the problem.
|
|
|
|
*
|
|
|
|
* Don't do this if the CPU already enforces it.
|
|
|
|
*/
|
|
|
|
arr[1].msr = MSR_IA32_PEBS_ENABLE;
|
|
|
|
arr[1].host = cpuc->pebs_enabled;
|
|
|
|
arr[1].guest = 0;
|
|
|
|
*nr = 2;
|
|
|
|
}
|
2011-10-05 20:01:21 +08:00
|
|
|
|
|
|
|
return arr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
|
|
|
struct perf_event *event = cpuc->events[idx];
|
|
|
|
|
|
|
|
arr[idx].msr = x86_pmu_config_addr(idx);
|
|
|
|
arr[idx].host = arr[idx].guest = 0;
|
|
|
|
|
|
|
|
if (!test_bit(idx, cpuc->active_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
arr[idx].host = arr[idx].guest =
|
|
|
|
event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
|
|
|
|
if (event->attr.exclude_host)
|
|
|
|
arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
else if (event->attr.exclude_guest)
|
|
|
|
arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*nr = x86_pmu.num_counters;
|
|
|
|
return arr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void core_pmu_enable_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (!event->attr.exclude_host)
|
|
|
|
x86_pmu_enable_event(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void core_pmu_enable_all(int added)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
int idx;
|
|
|
|
|
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
|
|
|
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
|
|
|
|
|
|
|
|
if (!test_bit(idx, cpuc->active_mask) ||
|
|
|
|
cpuc->events[idx]->attr.exclude_host)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static int hsw_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = intel_pmu_hw_config(event);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
|
|
|
|
return 0;
|
|
|
|
event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
|
|
|
|
* PEBS or in ANY thread mode. Since the results are non-sensical forbid
|
|
|
|
* this combination.
|
|
|
|
*/
|
|
|
|
if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
|
|
|
|
((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
|
|
|
|
event->attr.precise_ip > 0))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2013-09-06 11:37:38 +08:00
|
|
|
if (event_is_checkpointed(event)) {
|
|
|
|
/*
|
|
|
|
* Sampling of checkpointed events can cause situations where
|
|
|
|
* the CPU constantly aborts because of a overflow, which is
|
|
|
|
* then checkpointed back and ignored. Forbid checkpointing
|
|
|
|
* for sampling.
|
|
|
|
*
|
|
|
|
* But still allow a long sampling period, so that perf stat
|
|
|
|
* from KVM works.
|
|
|
|
*/
|
|
|
|
if (event->attr.sample_period > 0 &&
|
|
|
|
event->attr.sample_period < 0x7fffffff)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
2013-06-18 08:36:48 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-07-12 21:44:23 +08:00
|
|
|
static struct event_constraint counter0_constraint =
|
|
|
|
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint counter2_constraint =
|
|
|
|
EVENT_CONSTRAINT(0, 0x4, 0);
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct event_constraint fixed0_constraint =
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0);
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
static struct event_constraint fixed0_counter0_constraint =
|
|
|
|
INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL);
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint *
|
2014-11-18 03:06:56 +08:00
|
|
|
hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
2013-06-18 08:36:48 +08:00
|
|
|
{
|
2014-11-18 03:06:56 +08:00
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
|
|
|
|
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
|
|
|
|
if (c->idxmsk64 & (1U << 2))
|
|
|
|
return &counter2_constraint;
|
|
|
|
return &emptyconstraint;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Fixed counter 0 has less skid.
|
|
|
|
* Force instruction:ppp in Fixed counter 0
|
|
|
|
*/
|
|
|
|
if ((event->attr.precise_ip == 3) &&
|
|
|
|
constraint_match(&fixed0_constraint, event->hw.config))
|
|
|
|
return &fixed0_constraint;
|
|
|
|
|
|
|
|
return hsw_get_event_constraints(cpuc, idx, event);
|
|
|
|
}
|
|
|
|
|
2017-07-12 21:44:23 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
/* :ppp means to do reduced skid PEBS which is PMC0 only. */
|
|
|
|
if (event->attr.precise_ip == 3)
|
|
|
|
return &counter0_constraint;
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2019-04-11 02:57:09 +08:00
|
|
|
static struct event_constraint *
|
|
|
|
tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* :ppp means to do reduced skid PEBS,
|
|
|
|
* which is available on PMC0 and fixed counter 0.
|
|
|
|
*/
|
|
|
|
if (event->attr.precise_ip == 3) {
|
|
|
|
/* Force instruction:ppp on PMC0 and Fixed counter 0 */
|
|
|
|
if (constraint_match(&fixed0_constraint, event->hw.config))
|
|
|
|
return &fixed0_counter0_constraint;
|
|
|
|
|
|
|
|
return &counter0_constraint;
|
|
|
|
}
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
static bool allow_tsx_force_abort = true;
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Without TFA we must not use PMC3.
|
|
|
|
*/
|
2019-03-14 16:57:57 +08:00
|
|
|
if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
|
2019-03-06 05:23:18 +08:00
|
|
|
c = dyn_constraint(cpuc, c, idx);
|
|
|
|
c->idxmsk64 &= ~(1ULL << 3);
|
|
|
|
c->weight--;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
/*
|
|
|
|
* Broadwell:
|
|
|
|
*
|
|
|
|
* The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
|
|
|
|
* (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
|
|
|
|
* the two to enforce a minimum period of 128 (the smallest value that has bits
|
|
|
|
* 0-5 cleared and >= 100).
|
|
|
|
*
|
|
|
|
* Because of how the code in x86_perf_event_set_period() works, the truncation
|
|
|
|
* of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
|
|
|
|
* to make up for the 'lost' events due to carrying the 'error' in period_left.
|
|
|
|
*
|
|
|
|
* Therefore the effective (average) period matches the requested period,
|
|
|
|
* despite coarser hardware granularity.
|
|
|
|
*/
|
2018-03-02 01:54:54 +08:00
|
|
|
static u64 bdw_limit_period(struct perf_event *event, u64 left)
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
{
|
|
|
|
if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
|
|
|
|
X86_CONFIG(.event=0xc0, .umask=0x01)) {
|
|
|
|
if (left < 128)
|
|
|
|
left = 128;
|
2018-03-17 19:52:16 +08:00
|
|
|
left &= ~0x3fULL;
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
}
|
|
|
|
return left;
|
|
|
|
}
|
|
|
|
|
2019-08-20 07:13:31 +08:00
|
|
|
static u64 nhm_limit_period(struct perf_event *event, u64 left)
|
|
|
|
{
|
|
|
|
return max(left, 32ULL);
|
|
|
|
}
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
PMU_FORMAT_ATTR(event, "config:0-7" );
|
|
|
|
PMU_FORMAT_ATTR(umask, "config:8-15" );
|
|
|
|
PMU_FORMAT_ATTR(edge, "config:18" );
|
|
|
|
PMU_FORMAT_ATTR(pc, "config:19" );
|
|
|
|
PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
|
|
|
|
PMU_FORMAT_ATTR(inv, "config:23" );
|
|
|
|
PMU_FORMAT_ATTR(cmask, "config:24-31" );
|
2013-06-18 08:36:48 +08:00
|
|
|
PMU_FORMAT_ATTR(in_tx, "config:32");
|
|
|
|
PMU_FORMAT_ATTR(in_tx_cp, "config:33");
|
2012-03-16 03:09:14 +08:00
|
|
|
|
|
|
|
static struct attribute *intel_arch_formats_attr[] = {
|
|
|
|
&format_attr_event.attr,
|
|
|
|
&format_attr_umask.attr,
|
|
|
|
&format_attr_edge.attr,
|
|
|
|
&format_attr_pc.attr,
|
|
|
|
&format_attr_inv.attr,
|
|
|
|
&format_attr_cmask.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2012-10-10 20:53:14 +08:00
|
|
|
ssize_t intel_event_sysfs_show(char *page, u64 config)
|
|
|
|
{
|
|
|
|
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
|
|
|
|
|
|
|
|
return x86_event_sysfs_show(page, config, event);
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
static struct intel_shared_regs *allocate_shared_regs(int cpu)
|
2011-06-06 22:57:03 +08:00
|
|
|
{
|
|
|
|
struct intel_shared_regs *regs;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
regs = kzalloc_node(sizeof(struct intel_shared_regs),
|
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (regs) {
|
|
|
|
/*
|
|
|
|
* initialize the locks to keep lockdep happy
|
|
|
|
*/
|
|
|
|
for (i = 0; i < EXTRA_REG_MAX; i++)
|
|
|
|
raw_spin_lock_init(®s->regs[i].lock);
|
|
|
|
|
|
|
|
regs->core_id = -1;
|
|
|
|
}
|
|
|
|
return regs;
|
|
|
|
}
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *c;
|
|
|
|
|
|
|
|
c = kzalloc_node(sizeof(struct intel_excl_cntrs),
|
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (c) {
|
|
|
|
raw_spin_lock_init(&c->lock);
|
|
|
|
c->core_id = -1;
|
|
|
|
}
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
|
|
|
|
{
|
perf/x86/intel: Support adaptive PEBS v4
Adaptive PEBS is a new way to report PEBS sampling information. Instead
of a fixed size record for all PEBS events it allows to configure the
PEBS record to only include the information needed. Events can then opt
in to use such an extended record, or stay with a basic record which
only contains the IP.
The major new feature is to support LBRs in PEBS record.
Besides normal LBR, this allows (much faster) large PEBS, while still
supporting callstacks through callstack LBR. So essentially a lot of
profiling can now be done without frequent interrupts, dropping the
overhead significantly.
The main requirement still is to use a period, and not use frequency
mode, because frequency mode requires reevaluating the frequency on each
overflow.
The floating point state (XMM) is also supported, which allows efficient
profiling of FP function arguments.
Introduce specific drain function to handle variable length records.
Use a new callback to parse the new record format, and also handle the
STATUS field now being at a different offset.
Add code to set up the configuration register. Since there is only a
single register, all events either get the full super set of all events,
or only the basic record.
Originally-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: jolsa@kernel.org
Link: https://lkml.kernel.org/r/20190402194509.2832-6-kan.liang@linux.intel.com
[ Renamed GPRS => GP. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-03 03:45:02 +08:00
|
|
|
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
|
|
|
|
cpuc->shared_regs = allocate_shared_regs(cpu);
|
|
|
|
if (!cpuc->shared_regs)
|
2015-08-10 20:17:34 +08:00
|
|
|
goto err;
|
2014-11-18 03:06:57 +08:00
|
|
|
}
|
2011-03-03 10:34:50 +08:00
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
|
2014-11-18 03:06:57 +08:00
|
|
|
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
|
2014-11-18 03:06:57 +08:00
|
|
|
if (!cpuc->constraint_list)
|
2015-08-10 20:17:34 +08:00
|
|
|
goto err_shared_regs;
|
2019-03-06 05:23:18 +08:00
|
|
|
}
|
2014-11-18 03:06:57 +08:00
|
|
|
|
2019-03-06 05:23:18 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
|
2014-11-18 03:06:57 +08:00
|
|
|
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
|
2015-08-10 20:17:34 +08:00
|
|
|
if (!cpuc->excl_cntrs)
|
|
|
|
goto err_constraint_list;
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
cpuc->excl_thread_id = 0;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
return 0;
|
2015-08-10 20:17:34 +08:00
|
|
|
|
|
|
|
err_constraint_list:
|
|
|
|
kfree(cpuc->constraint_list);
|
|
|
|
cpuc->constraint_list = NULL;
|
|
|
|
|
|
|
|
err_shared_regs:
|
|
|
|
kfree(cpuc->shared_regs);
|
|
|
|
cpuc->shared_regs = NULL;
|
|
|
|
|
|
|
|
err:
|
2016-07-14 01:16:10 +08:00
|
|
|
return -ENOMEM;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
static int intel_pmu_cpu_prepare(int cpu)
|
|
|
|
{
|
|
|
|
return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
|
|
|
|
}
|
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static void flip_smm_bit(void *data)
|
|
|
|
{
|
|
|
|
unsigned long set = *(unsigned long *)data;
|
|
|
|
|
|
|
|
if (set > 0) {
|
|
|
|
msr_set_bit(MSR_IA32_DEBUGCTLMSR,
|
|
|
|
DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
|
|
|
|
} else {
|
|
|
|
msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
|
|
|
|
DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-03-05 20:49:35 +08:00
|
|
|
static void intel_pmu_cpu_starting(int cpu)
|
|
|
|
{
|
2011-03-03 10:34:47 +08:00
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
int core_id = topology_core_id(cpu);
|
|
|
|
int i;
|
|
|
|
|
2011-03-03 10:34:50 +08:00
|
|
|
init_debug_store_on_cpu(cpu);
|
|
|
|
/*
|
|
|
|
* Deal with CPUs that don't clear their LBRs on power-up.
|
|
|
|
*/
|
|
|
|
intel_pmu_lbr_reset();
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->lbr_sel = NULL;
|
|
|
|
|
2019-03-21 20:38:49 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_TFA) {
|
|
|
|
WARN_ON_ONCE(cpuc->tfa_shadow);
|
|
|
|
cpuc->tfa_shadow = ~0ULL;
|
|
|
|
intel_set_tfa(cpuc, false);
|
|
|
|
}
|
|
|
|
|
2018-04-26 02:57:17 +08:00
|
|
|
if (x86_pmu.version > 1)
|
|
|
|
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
|
2017-05-12 22:51:13 +08:00
|
|
|
|
2018-08-08 15:12:07 +08:00
|
|
|
if (x86_pmu.counter_freezing)
|
|
|
|
enable_counter_freeze();
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
if (!cpuc->shared_regs)
|
2011-03-03 10:34:50 +08:00
|
|
|
return;
|
|
|
|
|
2014-11-18 03:06:53 +08:00
|
|
|
if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
|
2015-05-26 21:11:28 +08:00
|
|
|
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
|
2012-02-10 06:20:53 +08:00
|
|
|
struct intel_shared_regs *pc;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
pc = per_cpu(cpu_hw_events, i).shared_regs;
|
|
|
|
if (pc && pc->core_id == core_id) {
|
2016-01-28 06:31:09 +08:00
|
|
|
cpuc->kfree_on_online[0] = cpuc->shared_regs;
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->shared_regs = pc;
|
|
|
|
break;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->shared_regs->core_id = core_id;
|
|
|
|
cpuc->shared_regs->refcnt++;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
if (x86_pmu.lbr_sel_map)
|
|
|
|
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
|
2014-11-18 03:06:57 +08:00
|
|
|
|
|
|
|
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
|
2015-05-26 21:11:28 +08:00
|
|
|
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
|
2017-01-16 11:21:11 +08:00
|
|
|
struct cpu_hw_events *sibling;
|
2014-11-18 03:06:57 +08:00
|
|
|
struct intel_excl_cntrs *c;
|
|
|
|
|
2017-01-16 11:21:11 +08:00
|
|
|
sibling = &per_cpu(cpu_hw_events, i);
|
|
|
|
c = sibling->excl_cntrs;
|
2014-11-18 03:06:57 +08:00
|
|
|
if (c && c->core_id == core_id) {
|
|
|
|
cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
|
|
|
|
cpuc->excl_cntrs = c;
|
2017-01-16 11:21:11 +08:00
|
|
|
if (!sibling->excl_thread_id)
|
|
|
|
cpuc->excl_thread_id = 1;
|
2014-11-18 03:06:57 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cpuc->excl_cntrs->core_id = core_id;
|
|
|
|
cpuc->excl_cntrs->refcnt++;
|
|
|
|
}
|
2010-03-05 20:49:35 +08:00
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
static void free_excl_cntrs(struct cpu_hw_events *cpuc)
|
2010-03-05 20:49:35 +08:00
|
|
|
{
|
2014-11-18 03:06:57 +08:00
|
|
|
struct intel_excl_cntrs *c;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
c = cpuc->excl_cntrs;
|
|
|
|
if (c) {
|
|
|
|
if (c->core_id == -1 || --c->refcnt == 0)
|
|
|
|
kfree(c);
|
|
|
|
cpuc->excl_cntrs = NULL;
|
|
|
|
}
|
2019-03-06 05:23:18 +08:00
|
|
|
|
|
|
|
kfree(cpuc->constraint_list);
|
|
|
|
cpuc->constraint_list = NULL;
|
2014-11-18 03:07:04 +08:00
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2014-11-18 03:07:04 +08:00
|
|
|
static void intel_pmu_cpu_dying(int cpu)
|
2018-12-20 00:53:50 +08:00
|
|
|
{
|
|
|
|
fini_debug_store_on_cpu(cpu);
|
|
|
|
|
|
|
|
if (x86_pmu.counter_freezing)
|
|
|
|
disable_counter_freeze();
|
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
|
2014-11-18 03:07:04 +08:00
|
|
|
{
|
|
|
|
struct intel_shared_regs *pc;
|
|
|
|
|
|
|
|
pc = cpuc->shared_regs;
|
|
|
|
if (pc) {
|
|
|
|
if (pc->core_id == -1 || --pc->refcnt == 0)
|
|
|
|
kfree(pc);
|
|
|
|
cpuc->shared_regs = NULL;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
free_excl_cntrs(cpuc);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_cpu_dead(int cpu)
|
|
|
|
{
|
|
|
|
intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
|
2010-03-05 20:49:35 +08:00
|
|
|
}
|
|
|
|
|
2015-05-07 03:33:51 +08:00
|
|
|
static void intel_pmu_sched_task(struct perf_event_context *ctx,
|
|
|
|
bool sched_in)
|
|
|
|
{
|
perf/x86/intel: Add proper condition to run sched_task callbacks
We have 2 functions using the same sched_task callback:
- PEBS drain for free running counters
- LBR save/store
Both of them are called from intel_pmu_sched_task() and
either of them can be unwillingly triggered when the
other one is configured to run.
Let's say there's PEBS drain configured in sched_task
callback for the event, but in the callback itself
(intel_pmu_sched_task()) we will also run the code for
LBR save/restore, which we did not ask for, but the
code in intel_pmu_sched_task() does not check for that.
This can lead to extra cycles in some perf monitoring,
like when we monitor PEBS event without LBR data.
# perf record --no-timestamp -c 10000 -e cycles:p ./perf bench sched pipe -l 1000000
(We need PEBS, non freq/non timestamp event to enable
the sched_task callback)
The perf stat of cycles and msr:write_msr for above
command before the change:
...
Performance counter stats for './perf record --no-timestamp -c 10000 -e cycles:p \
./perf bench sched pipe -l 1000000' (5 runs):
18,519,557,441 cycles:k
91,195,527 msr:write_msr
29.334476406 seconds time elapsed
And after the change:
...
Performance counter stats for './perf record --no-timestamp -c 10000 -e cycles:p \
./perf bench sched pipe -l 1000000' (5 runs):
18,704,973,540 cycles:k
27,184,720 msr:write_msr
16.977875900 seconds time elapsed
There's no affect on cycles:k because the sched_task happens
with events switched off, however the msr:write_msr tracepoint
counter together with almost 50% of time speedup show the
improvement.
Monitoring LBR event and having extra PEBS drain processing
in sched_task callback showed just a little speedup, because
the drain function does not do much extra work in case there
is no PEBS data.
Adding conditions to recognize the configured work that needs
to be done in the x86_pmu's sched_task callback.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20170719075247.GA27506@krava
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-19 15:52:47 +08:00
|
|
|
intel_pmu_pebs_sched_task(ctx, sched_in);
|
|
|
|
intel_pmu_lbr_sched_task(ctx, sched_in);
|
2015-05-07 03:33:51 +08:00
|
|
|
}
|
|
|
|
|
2019-10-23 15:13:56 +08:00
|
|
|
static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
|
|
|
|
struct perf_event_context *next)
|
|
|
|
{
|
|
|
|
intel_pmu_lbr_swap_task_ctx(prev, next);
|
|
|
|
}
|
|
|
|
|
2019-02-04 20:35:32 +08:00
|
|
|
static int intel_pmu_check_period(struct perf_event *event, u64 value)
|
|
|
|
{
|
|
|
|
return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
static int intel_pmu_aux_output_match(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (!x86_pmu.intel_cap.pebs_output_pt_available)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return is_intel_pt_event(event);
|
|
|
|
}
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
|
|
|
|
|
2013-01-24 23:10:33 +08:00
|
|
|
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
|
|
|
|
|
2015-09-10 05:53:59 +08:00
|
|
|
PMU_FORMAT_ATTR(frontend, "config1:0-23");
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
static struct attribute *intel_arch3_formats_attr[] = {
|
|
|
|
&format_attr_event.attr,
|
|
|
|
&format_attr_umask.attr,
|
|
|
|
&format_attr_edge.attr,
|
|
|
|
&format_attr_pc.attr,
|
|
|
|
&format_attr_any.attr,
|
|
|
|
&format_attr_inv.attr,
|
|
|
|
&format_attr_cmask.attr,
|
2017-08-23 02:52:00 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *hsw_format_attr[] = {
|
2013-06-18 08:36:48 +08:00
|
|
|
&format_attr_in_tx.attr,
|
|
|
|
&format_attr_in_tx_cp.attr,
|
2017-08-23 02:52:00 +08:00
|
|
|
&format_attr_offcore_rsp.attr,
|
|
|
|
&format_attr_ldlat.attr,
|
|
|
|
NULL
|
|
|
|
};
|
2012-03-16 03:09:14 +08:00
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
static struct attribute *nhm_format_attr[] = {
|
|
|
|
&format_attr_offcore_rsp.attr,
|
|
|
|
&format_attr_ldlat.attr,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *slm_format_attr[] = {
|
|
|
|
&format_attr_offcore_rsp.attr,
|
|
|
|
NULL
|
2012-03-16 03:09:14 +08:00
|
|
|
};
|
|
|
|
|
2015-09-10 05:53:59 +08:00
|
|
|
static struct attribute *skl_format_attr[] = {
|
|
|
|
&format_attr_frontend.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2015-04-21 23:26:23 +08:00
|
|
|
static __initconst const struct x86_pmu core_pmu = {
|
|
|
|
.name = "core",
|
|
|
|
.handle_irq = x86_pmu_handle_irq,
|
|
|
|
.disable_all = x86_pmu_disable_all,
|
|
|
|
.enable_all = core_pmu_enable_all,
|
|
|
|
.enable = core_pmu_enable_event,
|
|
|
|
.disable = x86_pmu_disable_event,
|
2018-11-21 18:16:10 +08:00
|
|
|
.hw_config = core_pmu_hw_config,
|
2015-04-21 23:26:23 +08:00
|
|
|
.schedule_events = x86_schedule_events,
|
|
|
|
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
|
|
|
|
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
|
|
|
|
.event_map = intel_pmu_event_map,
|
|
|
|
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
|
|
|
|
.apic = 1,
|
2018-03-12 22:45:37 +08:00
|
|
|
.large_pebs_flags = LARGE_PEBS_FLAGS,
|
2015-05-28 12:13:14 +08:00
|
|
|
|
2015-04-21 23:26:23 +08:00
|
|
|
/*
|
|
|
|
* Intel PMCs cannot be accessed sanely above 32-bit width,
|
|
|
|
* so we install an artificial 1<<31 period regardless of
|
|
|
|
* the generic event period:
|
|
|
|
*/
|
|
|
|
.max_period = (1ULL<<31) - 1,
|
|
|
|
.get_event_constraints = intel_get_event_constraints,
|
|
|
|
.put_event_constraints = intel_put_event_constraints,
|
|
|
|
.event_constraints = intel_core_event_constraints,
|
|
|
|
.guest_get_msrs = core_guest_get_msrs,
|
|
|
|
.format_attrs = intel_arch_formats_attr,
|
|
|
|
.events_sysfs_show = intel_event_sysfs_show,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Virtual (or funny metal) CPU can define x86_pmu.extra_regs
|
|
|
|
* together with PMU version 1 and thus be using core_pmu with
|
|
|
|
* shared_regs. We need following callbacks here to allocate
|
|
|
|
* it properly.
|
|
|
|
*/
|
|
|
|
.cpu_prepare = intel_pmu_cpu_prepare,
|
|
|
|
.cpu_starting = intel_pmu_cpu_starting,
|
|
|
|
.cpu_dying = intel_pmu_cpu_dying,
|
2018-12-20 00:53:50 +08:00
|
|
|
.cpu_dead = intel_pmu_cpu_dead,
|
2019-02-04 20:35:32 +08:00
|
|
|
|
|
|
|
.check_period = intel_pmu_check_period,
|
2020-07-03 20:49:08 +08:00
|
|
|
|
|
|
|
.lbr_reset = intel_pmu_lbr_reset_64,
|
2020-07-03 20:49:09 +08:00
|
|
|
.lbr_read = intel_pmu_lbr_read_64,
|
2020-07-03 20:49:10 +08:00
|
|
|
.lbr_save = intel_pmu_lbr_save,
|
|
|
|
.lbr_restore = intel_pmu_lbr_restore,
|
2015-04-21 23:26:23 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const struct x86_pmu intel_pmu = {
|
2010-02-26 19:05:05 +08:00
|
|
|
.name = "Intel",
|
|
|
|
.handle_irq = intel_pmu_handle_irq,
|
|
|
|
.disable_all = intel_pmu_disable_all,
|
|
|
|
.enable_all = intel_pmu_enable_all,
|
|
|
|
.enable = intel_pmu_enable_event,
|
|
|
|
.disable = intel_pmu_disable_event,
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
.add = intel_pmu_add_event,
|
|
|
|
.del = intel_pmu_del_event,
|
2018-02-13 06:20:34 +08:00
|
|
|
.read = intel_pmu_read_event,
|
2010-03-30 23:00:06 +08:00
|
|
|
.hw_config = intel_pmu_hw_config,
|
2010-03-12 00:54:39 +08:00
|
|
|
.schedule_events = x86_schedule_events,
|
2010-02-26 19:05:05 +08:00
|
|
|
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
|
|
|
|
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
|
|
|
|
.event_map = intel_pmu_event_map,
|
|
|
|
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
|
|
|
|
.apic = 1,
|
2018-03-12 22:45:37 +08:00
|
|
|
.large_pebs_flags = LARGE_PEBS_FLAGS,
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Intel PMCs cannot be accessed sanely above 32 bit width,
|
|
|
|
* so we install an artificial 1<<31 period regardless of
|
|
|
|
* the generic event period:
|
|
|
|
*/
|
|
|
|
.max_period = (1ULL << 31) - 1,
|
2010-03-05 20:01:18 +08:00
|
|
|
.get_event_constraints = intel_get_event_constraints,
|
2011-03-03 10:34:47 +08:00
|
|
|
.put_event_constraints = intel_put_event_constraints,
|
2012-06-05 16:26:43 +08:00
|
|
|
.pebs_aliases = intel_pebs_aliases_core2,
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
.format_attrs = intel_arch3_formats_attr,
|
2012-10-10 20:53:14 +08:00
|
|
|
.events_sysfs_show = intel_event_sysfs_show,
|
2012-03-16 03:09:14 +08:00
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
.cpu_prepare = intel_pmu_cpu_prepare,
|
2010-03-05 20:49:35 +08:00
|
|
|
.cpu_starting = intel_pmu_cpu_starting,
|
|
|
|
.cpu_dying = intel_pmu_cpu_dying,
|
2018-12-20 00:53:50 +08:00
|
|
|
.cpu_dead = intel_pmu_cpu_dead,
|
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
.guest_get_msrs = intel_guest_get_msrs,
|
2015-05-07 03:33:51 +08:00
|
|
|
.sched_task = intel_pmu_sched_task,
|
2019-10-23 15:13:56 +08:00
|
|
|
.swap_task_ctx = intel_pmu_swap_task_ctx,
|
2019-02-04 20:35:32 +08:00
|
|
|
|
|
|
|
.check_period = intel_pmu_check_period,
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares ability to output its data to Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area, therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
|
|
|
|
|
|
|
.aux_output_match = intel_pmu_aux_output_match,
|
2020-07-03 20:49:08 +08:00
|
|
|
|
|
|
|
.lbr_reset = intel_pmu_lbr_reset_64,
|
2020-07-03 20:49:09 +08:00
|
|
|
.lbr_read = intel_pmu_lbr_read_64,
|
2020-07-03 20:49:10 +08:00
|
|
|
.lbr_save = intel_pmu_lbr_save,
|
|
|
|
.lbr_restore = intel_pmu_lbr_restore,
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_clovertown_quirk(void)
|
2010-03-05 04:49:01 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* PEBS is unreliable due to:
|
|
|
|
*
|
|
|
|
* AJ67 - PEBS may experience CPL leaks
|
|
|
|
* AJ68 - PEBS PMI may be delayed by one event
|
|
|
|
* AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
|
|
|
|
* AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
|
|
|
|
*
|
|
|
|
* AJ67 could be worked around by restricting the OS/USR flags.
|
|
|
|
* AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
|
|
|
|
*
|
|
|
|
* AJ106 could possibly be worked around by not allowing LBR
|
|
|
|
* usage from PEBS, including the fixup.
|
|
|
|
* AJ68 could possibly be worked around by always programming
|
2011-04-27 17:51:41 +08:00
|
|
|
* a pebs_event_reset[0] value and coping with the lost events.
|
2010-03-05 04:49:01 +08:00
|
|
|
*
|
|
|
|
* But taken together it might just make sense to not enable PEBS on
|
|
|
|
* these chips.
|
|
|
|
*/
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("PEBS disabled due to CPU errata\n");
|
2010-03-05 04:49:01 +08:00
|
|
|
x86_pmu.pebs = 0;
|
|
|
|
x86_pmu.pebs_constraints = NULL;
|
|
|
|
}
|
|
|
|
|
2019-02-05 06:23:30 +08:00
|
|
|
static const struct x86_cpu_desc isolation_ucodes[] = {
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL, 3, 0x0000001f),
|
2019-08-28 03:48:22 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_L, 1, 0x0000001e),
|
2019-08-28 03:48:23 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_G, 1, 0x00000015),
|
2019-02-05 06:23:30 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a),
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL, 4, 0x00000023),
|
2019-08-28 03:48:23 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_G, 1, 0x00000014),
|
2019-08-28 03:48:24 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 2, 0x00000010),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002),
|
2019-02-05 06:23:30 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
|
2019-08-28 03:48:22 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
|
2019-08-28 03:48:22 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 9, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 10, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 11, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 12, 0x0000004e),
|
2019-08-28 03:48:21 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 10, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 11, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 12, 0x0000004e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 13, 0x0000004e),
|
2019-02-05 06:23:30 +08:00
|
|
|
{}
|
|
|
|
};
|
|
|
|
|
|
|
|
static void intel_check_pebs_isolation(void)
|
|
|
|
{
|
|
|
|
x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init void intel_pebs_isolation_quirk(void)
|
|
|
|
{
|
|
|
|
WARN_ON_ONCE(x86_pmu.check_microcode);
|
|
|
|
x86_pmu.check_microcode = intel_check_pebs_isolation;
|
|
|
|
intel_check_pebs_isolation();
|
|
|
|
}
|
|
|
|
|
2019-02-05 06:23:31 +08:00
|
|
|
static const struct x86_cpu_desc pebs_ucodes[] = {
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE, 7, 0x00000028),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 6, 0x00000618),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 7, 0x0000070c),
|
|
|
|
{}
|
|
|
|
};
|
2012-06-08 20:50:50 +08:00
|
|
|
|
2019-02-05 06:23:31 +08:00
|
|
|
static bool intel_snb_pebs_broken(void)
|
|
|
|
{
|
|
|
|
return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
|
2012-06-08 20:50:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_snb_check_microcode(void)
|
|
|
|
{
|
2019-02-05 06:23:31 +08:00
|
|
|
if (intel_snb_pebs_broken() == x86_pmu.pebs_broken)
|
2012-06-08 20:50:50 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Serialized by the microcode lock..
|
|
|
|
*/
|
|
|
|
if (x86_pmu.pebs_broken) {
|
|
|
|
pr_info("PEBS enabled due to microcode update\n");
|
|
|
|
x86_pmu.pebs_broken = 0;
|
|
|
|
} else {
|
|
|
|
pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
|
|
|
|
x86_pmu.pebs_broken = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-22 02:31:11 +08:00
|
|
|
static bool is_lbr_from(unsigned long msr)
|
|
|
|
{
|
|
|
|
unsigned long lbr_from_nr = x86_pmu.lbr_from + x86_pmu.lbr_nr;
|
|
|
|
|
|
|
|
return x86_pmu.lbr_from <= msr && msr < lbr_from_nr;
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Under certain circumstances, access certain MSR may cause #GP.
|
|
|
|
* The function tests if the input MSR can be safely accessed.
|
|
|
|
*/
|
|
|
|
static bool check_msr(unsigned long msr, u64 mask)
|
|
|
|
{
|
|
|
|
u64 val_old, val_new, val_tmp;
|
|
|
|
|
2019-06-16 22:13:13 +08:00
|
|
|
/*
|
|
|
|
* Disable the check for real HW, so we don't
|
|
|
|
* mess with potentionaly enabled registers:
|
|
|
|
*/
|
2019-07-25 10:39:26 +08:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
|
2019-06-16 22:13:13 +08:00
|
|
|
return true;
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Read the current value, change it and read it back to see if it
|
|
|
|
* matches, this is needed to detect certain hardware emulators
|
|
|
|
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
|
|
|
|
*/
|
|
|
|
if (rdmsrl_safe(msr, &val_old))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only change the bits which can be updated by wrmsrl.
|
|
|
|
*/
|
|
|
|
val_tmp = val_old ^ mask;
|
2016-06-22 02:31:11 +08:00
|
|
|
|
|
|
|
if (is_lbr_from(msr))
|
|
|
|
val_tmp = lbr_from_signext_quirk_wr(val_tmp);
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
if (wrmsrl_safe(msr, val_tmp) ||
|
|
|
|
rdmsrl_safe(msr, &val_new))
|
|
|
|
return false;
|
|
|
|
|
2016-06-22 02:31:11 +08:00
|
|
|
/*
|
|
|
|
* Quirk only affects validation in wrmsr(), so wrmsrl()'s value
|
|
|
|
* should equal rdmsrl()'s even with the quirk.
|
|
|
|
*/
|
2014-07-15 03:25:56 +08:00
|
|
|
if (val_new != val_tmp)
|
|
|
|
return false;
|
|
|
|
|
2016-06-22 02:31:11 +08:00
|
|
|
if (is_lbr_from(msr))
|
|
|
|
val_old = lbr_from_signext_quirk_wr(val_old);
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/* Here it's sure that the MSR can be safely accessed.
|
|
|
|
* Restore the old value and return.
|
|
|
|
*/
|
|
|
|
wrmsrl(msr, val_old);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_sandybridge_quirk(void)
|
2011-11-15 17:51:15 +08:00
|
|
|
{
|
2012-06-08 20:50:50 +08:00
|
|
|
x86_pmu.check_microcode = intel_snb_check_microcode;
|
2017-05-24 16:15:30 +08:00
|
|
|
cpus_read_lock();
|
2012-06-08 20:50:50 +08:00
|
|
|
intel_snb_check_microcode();
|
2017-05-24 16:15:30 +08:00
|
|
|
cpus_read_unlock();
|
2011-11-15 17:51:15 +08:00
|
|
|
}
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
|
|
|
|
{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
|
|
|
|
{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
|
|
|
|
{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
|
|
|
|
{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
|
|
|
|
{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
|
|
|
|
{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
|
|
|
|
{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
|
2011-11-10 20:57:26 +08:00
|
|
|
};
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_arch_events_quirk(void)
|
|
|
|
{
|
|
|
|
int bit;
|
|
|
|
|
|
|
|
/* disable event that reported as not presend by cpuid */
|
|
|
|
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
|
|
|
|
intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("CPUID marked event: \'%s\' unavailable\n",
|
|
|
|
intel_arch_events_map[bit].name);
|
2011-12-06 21:07:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init void intel_nehalem_quirk(void)
|
|
|
|
{
|
|
|
|
union cpuid10_ebx ebx;
|
|
|
|
|
|
|
|
ebx.full = x86_pmu.events_maskl;
|
|
|
|
if (ebx.split.no_branch_misses_retired) {
|
|
|
|
/*
|
|
|
|
* Erratum AAJ80 detected, we work it around by using
|
|
|
|
* the BR_MISP_EXEC.ANY event. This will over-count
|
|
|
|
* branch-misses, but it's still much better than the
|
|
|
|
* architectural event which is often completely bogus:
|
|
|
|
*/
|
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
|
|
|
|
ebx.split.no_branch_misses_retired = 0;
|
|
|
|
x86_pmu.events_maskl = ebx.full;
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("CPU erratum AAJ80 worked around\n");
|
2011-12-06 21:07:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-05 06:23:32 +08:00
|
|
|
static const struct x86_cpu_desc counter_freezing_ucodes[] = {
|
2019-02-05 06:23:33 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008),
|
2019-08-28 03:48:24 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028),
|
2019-02-05 06:23:33 +08:00
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028),
|
|
|
|
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006),
|
2019-02-05 06:23:32 +08:00
|
|
|
{}
|
|
|
|
};
|
2018-08-08 15:12:08 +08:00
|
|
|
|
2019-02-05 06:23:32 +08:00
|
|
|
static bool intel_counter_freezing_broken(void)
|
|
|
|
{
|
|
|
|
return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes);
|
2018-08-08 15:12:08 +08:00
|
|
|
}
|
|
|
|
|
2019-02-05 06:23:32 +08:00
|
|
|
static __init void intel_counter_freezing_quirk(void)
|
2018-08-08 15:12:08 +08:00
|
|
|
{
|
|
|
|
/* Check if it's already disabled */
|
|
|
|
if (disable_counter_freezing)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the system starts with the wrong ucode, leave the
|
|
|
|
* counter-freezing feature permanently disabled.
|
|
|
|
*/
|
2019-02-05 06:23:32 +08:00
|
|
|
if (intel_counter_freezing_broken()) {
|
2018-08-08 15:12:08 +08:00
|
|
|
pr_info("PMU counter freezing disabled due to CPU errata,"
|
|
|
|
"please upgrade microcode\n");
|
|
|
|
x86_pmu.counter_freezing = false;
|
|
|
|
x86_pmu.handle_irq = intel_pmu_handle_irq;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-18 03:06:59 +08:00
|
|
|
/*
|
|
|
|
* enable software workaround for errata:
|
|
|
|
* SNB: BJ122
|
|
|
|
* IVB: BV98
|
|
|
|
* HSW: HSD29
|
|
|
|
*
|
|
|
|
* Only needed when HT is enabled. However detecting
|
2014-11-18 03:07:04 +08:00
|
|
|
* if HT is enabled is difficult (model specific). So instead,
|
|
|
|
* we enable the workaround in the early boot, and verify if
|
|
|
|
* it is needed in a later initcall phase once we have valid
|
|
|
|
* topology information to check if HT is actually enabled
|
2014-11-18 03:06:59 +08:00
|
|
|
*/
|
|
|
|
static __init void intel_ht_bug(void)
|
|
|
|
{
|
2014-11-18 03:07:04 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
x86_pmu.start_scheduling = intel_start_scheduling;
|
2015-05-21 16:57:32 +08:00
|
|
|
x86_pmu.commit_scheduling = intel_commit_scheduling;
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_pmu.stop_scheduling = intel_stop_scheduling;
|
|
|
|
}
|
|
|
|
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
|
|
|
|
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
|
2013-06-18 08:36:52 +08:00
|
|
|
|
2013-09-06 11:37:40 +08:00
|
|
|
/* Haswell special events */
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
|
|
|
|
EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
|
2013-09-06 11:37:40 +08:00
|
|
|
|
2013-06-18 08:36:52 +08:00
|
|
|
static struct attribute *hsw_events_attrs[] = {
|
2017-11-09 08:07:18 +08:00
|
|
|
EVENT_PTR(td_slots_issued),
|
|
|
|
EVENT_PTR(td_slots_retired),
|
|
|
|
EVENT_PTR(td_fetch_bubbles),
|
|
|
|
EVENT_PTR(td_total_slots),
|
|
|
|
EVENT_PTR(td_total_slots_scale),
|
|
|
|
EVENT_PTR(td_recovery_bubbles),
|
|
|
|
EVENT_PTR(td_recovery_bubbles_scale),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
static struct attribute *hsw_mem_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_hsw),
|
|
|
|
EVENT_PTR(mem_st_hsw),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2017-11-09 08:07:18 +08:00
|
|
|
static struct attribute *hsw_tsx_events_attrs[] = {
|
2013-09-06 11:37:40 +08:00
|
|
|
EVENT_PTR(tx_start),
|
|
|
|
EVENT_PTR(tx_commit),
|
|
|
|
EVENT_PTR(tx_abort),
|
|
|
|
EVENT_PTR(tx_capacity),
|
|
|
|
EVENT_PTR(tx_conflict),
|
|
|
|
EVENT_PTR(el_start),
|
|
|
|
EVENT_PTR(el_commit),
|
|
|
|
EVENT_PTR(el_abort),
|
|
|
|
EVENT_PTR(el_capacity),
|
|
|
|
EVENT_PTR(el_conflict),
|
|
|
|
EVENT_PTR(cycles_t),
|
|
|
|
EVENT_PTR(cycles_ct),
|
2013-06-18 08:36:52 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2019-04-03 03:45:05 +08:00
|
|
|
EVENT_ATTR_STR(tx-capacity-read, tx_capacity_read, "event=0x54,umask=0x80");
|
|
|
|
EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-capacity-read, el_capacity_read, "event=0x54,umask=0x80");
|
|
|
|
EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2");
|
|
|
|
|
|
|
|
static struct attribute *icl_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_hsw),
|
|
|
|
EVENT_PTR(mem_st_hsw),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *icl_tsx_events_attrs[] = {
|
|
|
|
EVENT_PTR(tx_start),
|
|
|
|
EVENT_PTR(tx_abort),
|
|
|
|
EVENT_PTR(tx_commit),
|
|
|
|
EVENT_PTR(tx_capacity_read),
|
|
|
|
EVENT_PTR(tx_capacity_write),
|
|
|
|
EVENT_PTR(tx_conflict),
|
|
|
|
EVENT_PTR(el_start),
|
|
|
|
EVENT_PTR(el_abort),
|
|
|
|
EVENT_PTR(el_commit),
|
|
|
|
EVENT_PTR(el_capacity_read),
|
|
|
|
EVENT_PTR(el_capacity_write),
|
|
|
|
EVENT_PTR(el_conflict),
|
|
|
|
EVENT_PTR(cycles_t),
|
|
|
|
EVENT_PTR(cycles_ct),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static ssize_t freeze_on_smi_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEFINE_MUTEX(freeze_on_smi_mutex);
|
|
|
|
|
|
|
|
static ssize_t freeze_on_smi_store(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
unsigned long val;
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
ret = kstrtoul(buf, 0, &val);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (val > 1)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
mutex_lock(&freeze_on_smi_mutex);
|
|
|
|
|
|
|
|
if (x86_pmu.attr_freeze_on_smi == val)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
x86_pmu.attr_freeze_on_smi = val;
|
|
|
|
|
|
|
|
get_online_cpus();
|
|
|
|
on_each_cpu(flip_smm_bit, &val, 1);
|
|
|
|
put_online_cpus();
|
|
|
|
done:
|
|
|
|
mutex_unlock(&freeze_on_smi_mutex);
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Force resched when TFA sysctl is modified
This patch provides guarantee to the sysadmin that when TFA is disabled, no PMU
event is using PMC3 when the echo command returns. Vice-Versa, when TFA
is enabled, PMU can use PMC3 immediately (to eliminate possible multiplexing).
$ perf stat -a -I 1000 --no-merge -e branches,branches,branches,branches
1.000123979 125,768,725,208 branches
1.000562520 125,631,000,456 branches
1.000942898 125,487,114,291 branches
1.001333316 125,323,363,620 branches
2.004721306 125,514,968,546 branches
2.005114560 125,511,110,861 branches
2.005482722 125,510,132,724 branches
2.005851245 125,508,967,086 branches
3.006323475 125,166,570,648 branches
3.006709247 125,165,650,056 branches
3.007086605 125,164,639,142 branches
3.007459298 125,164,402,912 branches
4.007922698 125,045,577,140 branches
4.008310775 125,046,804,324 branches
4.008670814 125,048,265,111 branches
4.009039251 125,048,677,611 branches
5.009503373 125,122,240,217 branches
5.009897067 125,122,450,517 branches
Then on another connection, sysadmin does:
$ echo 1 >/sys/devices/cpu/allow_tsx_force_abort
Then perf stat adjusts the events immediately:
5.010286029 125,121,393,483 branches
5.010646308 125,120,556,786 branches
6.011113588 124,963,351,832 branches
6.011510331 124,964,267,566 branches
6.011889913 124,964,829,130 branches
6.012262996 124,965,841,156 branches
7.012708299 124,419,832,234 branches [79.69%]
7.012847908 124,416,363,853 branches [79.73%]
7.013225462 124,400,723,712 branches [79.73%]
7.013598191 124,376,154,434 branches [79.70%]
8.014089834 124,250,862,693 branches [74.98%]
8.014481363 124,267,539,139 branches [74.94%]
8.014856006 124,259,519,786 branches [74.98%]
8.014980848 124,225,457,969 branches [75.04%]
9.015464576 124,204,235,423 branches [75.03%]
9.015858587 124,204,988,490 branches [75.04%]
9.016243680 124,220,092,486 branches [74.99%]
9.016620104 124,231,260,146 branches [74.94%]
And vice-versa if the syadmin does:
$ echo 0 >/sys/devices/cpu/allow_tsx_force_abort
Events are again spread over the 4 counters:
10.017096277 124,276,230,565 branches [74.96%]
10.017237209 124,228,062,171 branches [75.03%]
10.017478637 124,178,780,626 branches [75.03%]
10.017853402 124,198,316,177 branches [75.03%]
11.018334423 124,602,418,933 branches [85.40%]
11.018722584 124,602,921,320 branches [85.42%]
11.019095621 124,603,956,093 branches [85.42%]
11.019467742 124,595,273,783 branches [85.42%]
12.019945736 125,110,114,864 branches
12.020330764 125,109,334,472 branches
12.020688740 125,109,818,865 branches
12.021054020 125,108,594,014 branches
13.021516774 125,109,164,018 branches
13.021903640 125,108,794,510 branches
13.022270770 125,107,756,978 branches
13.022630819 125,109,380,471 branches
14.023114989 125,133,140,817 branches
14.023501880 125,133,785,858 branches
14.023868339 125,133,852,700 branches
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: kan.liang@intel.com
Cc: nelson.dsouza@intel.com
Cc: tonyj@suse.com
Link: https://lkml.kernel.org/r/20190408173252.37932-3-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-09 01:32:52 +08:00
|
|
|
static void update_tfa_sched(void *ignored)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check if PMC3 is used
|
|
|
|
* and if so force schedule out for all event types all contexts
|
|
|
|
*/
|
|
|
|
if (test_bit(3, cpuc->active_mask))
|
|
|
|
perf_pmu_resched(x86_get_pmu());
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t show_sysctl_tfa(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, 40, "%d\n", allow_tsx_force_abort);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t set_sysctl_tfa(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
const char *buf, size_t count)
|
|
|
|
{
|
|
|
|
bool val;
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
ret = kstrtobool(buf, &val);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
/* no change */
|
|
|
|
if (val == allow_tsx_force_abort)
|
|
|
|
return count;
|
|
|
|
|
|
|
|
allow_tsx_force_abort = val;
|
|
|
|
|
|
|
|
get_online_cpus();
|
|
|
|
on_each_cpu(update_tfa_sched, NULL, 1);
|
|
|
|
put_online_cpus();
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static DEVICE_ATTR_RW(freeze_on_smi);
|
|
|
|
|
2017-08-23 02:52:01 +08:00
|
|
|
static ssize_t branches_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR_RO(branches);
|
|
|
|
|
|
|
|
static struct attribute *lbr_attrs[] = {
|
|
|
|
&dev_attr_branches.attr,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
static char pmu_name_str[30];
|
|
|
|
|
|
|
|
static ssize_t pmu_name_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR_RO(pmu_name);
|
|
|
|
|
|
|
|
static struct attribute *intel_pmu_caps_attrs[] = {
|
2017-08-28 18:46:50 +08:00
|
|
|
&dev_attr_pmu_name.attr,
|
|
|
|
NULL
|
2017-08-23 02:52:01 +08:00
|
|
|
};
|
|
|
|
|
perf/x86/intel: Force resched when TFA sysctl is modified
This patch provides guarantee to the sysadmin that when TFA is disabled, no PMU
event is using PMC3 when the echo command returns. Vice-Versa, when TFA
is enabled, PMU can use PMC3 immediately (to eliminate possible multiplexing).
$ perf stat -a -I 1000 --no-merge -e branches,branches,branches,branches
1.000123979 125,768,725,208 branches
1.000562520 125,631,000,456 branches
1.000942898 125,487,114,291 branches
1.001333316 125,323,363,620 branches
2.004721306 125,514,968,546 branches
2.005114560 125,511,110,861 branches
2.005482722 125,510,132,724 branches
2.005851245 125,508,967,086 branches
3.006323475 125,166,570,648 branches
3.006709247 125,165,650,056 branches
3.007086605 125,164,639,142 branches
3.007459298 125,164,402,912 branches
4.007922698 125,045,577,140 branches
4.008310775 125,046,804,324 branches
4.008670814 125,048,265,111 branches
4.009039251 125,048,677,611 branches
5.009503373 125,122,240,217 branches
5.009897067 125,122,450,517 branches
Then on another connection, sysadmin does:
$ echo 1 >/sys/devices/cpu/allow_tsx_force_abort
Then perf stat adjusts the events immediately:
5.010286029 125,121,393,483 branches
5.010646308 125,120,556,786 branches
6.011113588 124,963,351,832 branches
6.011510331 124,964,267,566 branches
6.011889913 124,964,829,130 branches
6.012262996 124,965,841,156 branches
7.012708299 124,419,832,234 branches [79.69%]
7.012847908 124,416,363,853 branches [79.73%]
7.013225462 124,400,723,712 branches [79.73%]
7.013598191 124,376,154,434 branches [79.70%]
8.014089834 124,250,862,693 branches [74.98%]
8.014481363 124,267,539,139 branches [74.94%]
8.014856006 124,259,519,786 branches [74.98%]
8.014980848 124,225,457,969 branches [75.04%]
9.015464576 124,204,235,423 branches [75.03%]
9.015858587 124,204,988,490 branches [75.04%]
9.016243680 124,220,092,486 branches [74.99%]
9.016620104 124,231,260,146 branches [74.94%]
And vice-versa if the syadmin does:
$ echo 0 >/sys/devices/cpu/allow_tsx_force_abort
Events are again spread over the 4 counters:
10.017096277 124,276,230,565 branches [74.96%]
10.017237209 124,228,062,171 branches [75.03%]
10.017478637 124,178,780,626 branches [75.03%]
10.017853402 124,198,316,177 branches [75.03%]
11.018334423 124,602,418,933 branches [85.40%]
11.018722584 124,602,921,320 branches [85.42%]
11.019095621 124,603,956,093 branches [85.42%]
11.019467742 124,595,273,783 branches [85.42%]
12.019945736 125,110,114,864 branches
12.020330764 125,109,334,472 branches
12.020688740 125,109,818,865 branches
12.021054020 125,108,594,014 branches
13.021516774 125,109,164,018 branches
13.021903640 125,108,794,510 branches
13.022270770 125,107,756,978 branches
13.022630819 125,109,380,471 branches
14.023114989 125,133,140,817 branches
14.023501880 125,133,785,858 branches
14.023868339 125,133,852,700 branches
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: kan.liang@intel.com
Cc: nelson.dsouza@intel.com
Cc: tonyj@suse.com
Link: https://lkml.kernel.org/r/20190408173252.37932-3-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-09 01:32:52 +08:00
|
|
|
static DEVICE_ATTR(allow_tsx_force_abort, 0644,
|
|
|
|
show_sysctl_tfa,
|
|
|
|
set_sysctl_tfa);
|
2019-03-06 05:23:18 +08:00
|
|
|
|
2017-05-12 22:51:13 +08:00
|
|
|
static struct attribute *intel_pmu_attrs[] = {
|
|
|
|
&dev_attr_freeze_on_smi.attr,
|
2019-05-24 21:21:52 +08:00
|
|
|
&dev_attr_allow_tsx_force_abort.attr,
|
2017-05-12 22:51:13 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static umode_t
|
|
|
|
tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
2018-09-06 21:57:48 +08:00
|
|
|
{
|
2019-05-12 23:55:13 +08:00
|
|
|
return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0;
|
|
|
|
}
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static umode_t
|
|
|
|
pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
return x86_pmu.pebs ? attr->mode : 0;
|
|
|
|
}
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2019-05-12 23:55:15 +08:00
|
|
|
static umode_t
|
|
|
|
lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
return x86_pmu.lbr_nr ? attr->mode : 0;
|
|
|
|
}
|
|
|
|
|
2019-05-12 23:55:16 +08:00
|
|
|
static umode_t
|
|
|
|
exra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
return x86_pmu.version >= 2 ? attr->mode : 0;
|
|
|
|
}
|
|
|
|
|
2019-05-24 21:21:52 +08:00
|
|
|
static umode_t
|
|
|
|
default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
|
|
|
{
|
|
|
|
if (attr == &dev_attr_allow_tsx_force_abort.attr)
|
|
|
|
return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
|
|
|
|
|
|
|
|
return attr->mode;
|
|
|
|
}
|
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static struct attribute_group group_events_td = {
|
|
|
|
.name = "events",
|
|
|
|
};
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static struct attribute_group group_events_mem = {
|
|
|
|
.name = "events",
|
|
|
|
.is_visible = pebs_is_visible,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute_group group_events_tsx = {
|
|
|
|
.name = "events",
|
|
|
|
.is_visible = tsx_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:15 +08:00
|
|
|
static struct attribute_group group_caps_gen = {
|
|
|
|
.name = "caps",
|
|
|
|
.attrs = intel_pmu_caps_attrs,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute_group group_caps_lbr = {
|
|
|
|
.name = "caps",
|
|
|
|
.attrs = lbr_attrs,
|
|
|
|
.is_visible = lbr_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:16 +08:00
|
|
|
static struct attribute_group group_format_extra = {
|
|
|
|
.name = "format",
|
|
|
|
.is_visible = exra_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:17 +08:00
|
|
|
static struct attribute_group group_format_extra_skl = {
|
|
|
|
.name = "format",
|
|
|
|
.is_visible = exra_is_visible,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:18 +08:00
|
|
|
static struct attribute_group group_default = {
|
2019-05-24 21:21:52 +08:00
|
|
|
.attrs = intel_pmu_attrs,
|
|
|
|
.is_visible = default_is_visible,
|
2019-05-12 23:55:18 +08:00
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
static const struct attribute_group *attr_update[] = {
|
|
|
|
&group_events_td,
|
|
|
|
&group_events_mem,
|
|
|
|
&group_events_tsx,
|
2019-05-12 23:55:15 +08:00
|
|
|
&group_caps_gen,
|
|
|
|
&group_caps_lbr,
|
2019-05-12 23:55:16 +08:00
|
|
|
&group_format_extra,
|
2019-05-12 23:55:17 +08:00
|
|
|
&group_format_extra_skl,
|
2019-05-12 23:55:18 +08:00
|
|
|
&group_default,
|
2019-05-12 23:55:13 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct attribute *empty_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
__init int intel_pmu_init(void)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2019-05-12 23:55:17 +08:00
|
|
|
struct attribute **extra_skl_attr = &empty_attrs;
|
2019-05-12 23:55:13 +08:00
|
|
|
struct attribute **extra_attr = &empty_attrs;
|
|
|
|
struct attribute **td_attr = &empty_attrs;
|
|
|
|
struct attribute **mem_attr = &empty_attrs;
|
|
|
|
struct attribute **tsx_attr = &empty_attrs;
|
2010-02-26 19:05:05 +08:00
|
|
|
union cpuid10_edx edx;
|
|
|
|
union cpuid10_eax eax;
|
2011-11-10 20:57:26 +08:00
|
|
|
union cpuid10_ebx ebx;
|
2012-06-21 02:46:34 +08:00
|
|
|
struct event_constraint *c;
|
2010-02-26 19:05:05 +08:00
|
|
|
unsigned int unused;
|
2014-07-15 03:25:56 +08:00
|
|
|
struct extra_reg *er;
|
2019-06-03 21:41:21 +08:00
|
|
|
bool pmem = false;
|
2014-07-15 03:25:56 +08:00
|
|
|
int version, i;
|
2017-08-23 02:52:01 +08:00
|
|
|
char *name;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
|
2010-03-12 00:54:39 +08:00
|
|
|
switch (boot_cpu_data.x86) {
|
|
|
|
case 0x6:
|
|
|
|
return p6_pmu_init();
|
2012-09-27 02:12:52 +08:00
|
|
|
case 0xb:
|
|
|
|
return knc_pmu_init();
|
2010-03-12 00:54:39 +08:00
|
|
|
case 0xf:
|
|
|
|
return p4_pmu_init();
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether the Architectural PerfMon supports
|
|
|
|
* Branch Misses Retired hw_event or not.
|
|
|
|
*/
|
2011-11-10 20:57:26 +08:00
|
|
|
cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
|
|
|
|
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
|
2010-02-26 19:05:05 +08:00
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
version = eax.split.version_id;
|
|
|
|
if (version < 2)
|
|
|
|
x86_pmu = core_pmu;
|
|
|
|
else
|
|
|
|
x86_pmu = intel_pmu;
|
|
|
|
|
|
|
|
x86_pmu.version = version;
|
2010-03-30 00:36:50 +08:00
|
|
|
x86_pmu.num_counters = eax.split.num_counters;
|
|
|
|
x86_pmu.cntval_bits = eax.split.bit_width;
|
|
|
|
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_pmu.events_maskl = ebx.full;
|
|
|
|
x86_pmu.events_mask_len = eax.split.mask_length;
|
|
|
|
|
2012-06-06 08:56:48 +08:00
|
|
|
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Quirk: v2 perfmon does not report fixed-purpose events, so
|
2016-10-21 16:18:59 +08:00
|
|
|
* assume at least 3 events, when not running in a hypervisor:
|
2010-02-26 19:05:05 +08:00
|
|
|
*/
|
2016-10-21 16:18:59 +08:00
|
|
|
if (version > 1) {
|
|
|
|
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
|
|
|
|
|
|
|
|
x86_pmu.num_counters_fixed =
|
|
|
|
max((int)edx.split.num_counters_fixed, assume);
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2018-08-08 15:12:07 +08:00
|
|
|
if (version >= 4)
|
|
|
|
x86_pmu.counter_freezing = !disable_counter_freezing;
|
|
|
|
|
2014-02-03 21:29:03 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_PDCM)) {
|
2010-03-04 00:07:40 +08:00
|
|
|
u64 capabilities;
|
|
|
|
|
|
|
|
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
|
|
|
|
x86_pmu.intel_cap.capabilities = capabilities;
|
|
|
|
}
|
|
|
|
|
2020-07-03 20:49:09 +08:00
|
|
|
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) {
|
2020-07-03 20:49:08 +08:00
|
|
|
x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
|
2020-07-03 20:49:09 +08:00
|
|
|
x86_pmu.lbr_read = intel_pmu_lbr_read_32;
|
|
|
|
}
|
2020-07-03 20:49:08 +08:00
|
|
|
|
perf/x86/intel/lbr: Support Architectural LBR
Last Branch Records (LBR) enables recording of software path history by
logging taken branches and other control flows within architectural
registers now. Intel CPUs have had model-specific LBR for quite some
time, but this evolves them into an architectural feature now.
The main improvements of Architectural LBR implemented includes:
- Linux kernel can support the LBR features without knowing the model
number of the current CPU.
- Architectural LBR capabilities can be enumerated by CPUID. The
lbr_ctl_map is based on the CPUID Enumeration.
- The possible LBR depth can be retrieved from CPUID enumeration. The
max value is written to the new MSR_ARCH_LBR_DEPTH as the number of
LBR entries.
- A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs,
which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR.
- Each LBR record or entry is still comprised of three MSRs,
IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_TO_IP.
But they become the architectural MSRs.
- Architectural LBR is stack-like now. Entry 0 is always the youngest
branch, entry 1 the next youngest... The TOS MSR has been removed.
The way to enable/disable Architectural LBR is similar to the previous
model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but
some modifications are required, which include:
- MSR_ARCH_LBR_CTL is used to enable and configure the Architectural
LBR.
- When checking the value of the IA32_DEBUGCTL MSR, ignoring the
DEBUGCTLMSR_LBR (bit 0) for Architectural LBR, which has no meaning
and always return 0.
- The FREEZE_LBRS_ON_PMI has to be explicitly set/clear, because
MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for
Architectural LBR.
- Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for
Architectural LBR.
Some Architectural LBR dedicated functions are implemented to
reset/read/save/restore LBR.
- For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR
entries, which is a lot faster and can improve the context switch
latency.
- For read, the branch type information can be retrieved from
the MSR_ARCH_LBR_INFO_*. But it's not fully compatible due to
OTHER_BRANCH type. The software decoding is still required for the
OTHER_BRANCH case.
LBR records are stored in the age order as well. Reuse
intel_pmu_store_lbr(). Check the CPUID enumeration before accessing
the corresponding bits in LBR_INFO.
- For save/restore, applying the fast reset (writing ARCH_LBR_DEPTH).
Reading 'lbr_from' of entry 0 instead of the TOS MSR to check if the
LBR registers are reset in the deep C-state. If 'the deep C-state
reset' bit is not set in CPUID enumeration, ignoring the check.
XSAVE support for Architectural LBR will be implemented later.
The number of LBR entries cannot be hardcoded anymore, which should be
retrieved from CPUID enumeration. A new structure
x86_perf_task_context_arch_lbr is introduced for Architectural LBR.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-15-git-send-email-kan.liang@linux.intel.com
2020-07-03 20:49:20 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_ARCH_LBR))
|
|
|
|
intel_pmu_arch_lbr_init();
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
intel_ds_init();
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Install the hw-cache-events table:
|
|
|
|
*/
|
|
|
|
switch (boot_cpu_data.x86_model) {
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_CORE_YONAH:
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Core events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "core";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_CORE2_MEROM:
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_clovertown_quirk);
|
2019-01-26 02:49:17 +08:00
|
|
|
/* fall through */
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_CORE2_MEROM_L:
|
|
|
|
case INTEL_FAM6_CORE2_PENRYN:
|
|
|
|
case INTEL_FAM6_CORE2_DUNNINGTON:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_core();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_core2_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Core2 events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "core2";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_NEHALEM:
|
|
|
|
case INTEL_FAM6_NEHALEM_EP:
|
|
|
|
case INTEL_FAM6_NEHALEM_EX:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2011-03-03 10:34:48 +08:00
|
|
|
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_nhm();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_nehalem_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
|
2010-03-26 21:08:44 +08:00
|
|
|
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
|
2011-03-03 10:34:47 +08:00
|
|
|
x86_pmu.extra_regs = intel_nehalem_extra_regs;
|
2019-08-20 07:13:31 +08:00
|
|
|
x86_pmu.limit_period = nhm_limit_period;
|
2011-04-27 17:51:41 +08:00
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = nhm_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2011-04-29 20:17:19 +08:00
|
|
|
/* UOPS_ISSUED.STALLED_CYCLES */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-04-29 20:17:19 +08:00
|
|
|
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
|
2011-04-24 14:18:31 +08:00
|
|
|
|
2016-03-02 06:25:24 +08:00
|
|
|
intel_pmu_pebs_data_source_nhm();
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_nehalem_quirk);
|
2017-08-17 06:21:53 +08:00
|
|
|
x86_pmu.pebs_no_tlb = 1;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
2011-04-27 17:51:41 +08:00
|
|
|
|
2010-03-26 21:08:44 +08:00
|
|
|
pr_cont("Nehalem events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "nehalem";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
2010-03-03 19:02:30 +08:00
|
|
|
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_BONNELL:
|
|
|
|
case INTEL_FAM6_ATOM_BONNELL_MID:
|
|
|
|
case INTEL_FAM6_ATOM_SALTWELL:
|
|
|
|
case INTEL_FAM6_ATOM_SALTWELL_MID:
|
|
|
|
case INTEL_FAM6_ATOM_SALTWELL_TABLET:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_atom();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_gen_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
|
2015-12-04 04:03:10 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Atom events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "bonnell";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
|
|
|
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_SILVERMONT:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ATOM_SILVERMONT_D:
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_SILVERMONT_MID:
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_ATOM_AIRMONT:
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_AIRMONT_MID:
|
2013-07-18 17:02:24 +08:00
|
|
|
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
2016-04-15 15:53:45 +08:00
|
|
|
intel_pmu_lbr_init_slm();
|
2013-07-18 17:02:24 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_slm_extra_regs;
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = slm_events_attrs;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2013-07-18 17:02:24 +08:00
|
|
|
pr_cont("Silvermont events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "silvermont";
|
2013-07-18 17:02:24 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT_D:
|
2019-02-05 06:23:33 +08:00
|
|
|
x86_add_quirk(intel_counter_freezing_quirk);
|
2016-04-15 15:42:47 +08:00
|
|
|
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_glm_extra_regs;
|
|
|
|
/*
|
|
|
|
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
|
|
|
|
* for precise cycles.
|
|
|
|
* :pp is identical to :ppp
|
|
|
|
*/
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2016-04-28 23:35:46 +08:00
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
2016-04-15 15:42:47 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = glm_events_attrs;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2016-04-15 15:42:47 +08:00
|
|
|
pr_cont("Goldmont events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "goldmont";
|
2016-04-15 15:42:47 +08:00
|
|
|
break;
|
|
|
|
|
2018-08-08 01:17:27 +08:00
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
|
2019-02-05 06:23:32 +08:00
|
|
|
x86_add_quirk(intel_counter_freezing_quirk);
|
2017-07-12 21:44:23 +08:00
|
|
|
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_glm_extra_regs;
|
|
|
|
/*
|
|
|
|
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
|
|
|
|
* for precise cycles.
|
|
|
|
*/
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2018-03-09 10:15:42 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_PEBS_ALL;
|
2017-07-12 21:44:23 +08:00
|
|
|
x86_pmu.get_event_constraints = glp_get_event_constraints;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = glm_events_attrs;
|
2017-07-12 21:44:23 +08:00
|
|
|
/* Goldmont Plus has 4-wide pipeline */
|
|
|
|
event_attr_td_total_slots_scale_glm.event_str = "4";
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2017-07-12 21:44:23 +08:00
|
|
|
pr_cont("Goldmont plus events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "goldmont_plus";
|
2017-07-12 21:44:23 +08:00
|
|
|
break;
|
|
|
|
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ATOM_TREMONT_D:
|
2020-01-29 02:31:17 +08:00
|
|
|
case INTEL_FAM6_ATOM_TREMONT:
|
2019-04-11 02:57:09 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_tnt_extra_regs;
|
|
|
|
/*
|
|
|
|
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
|
|
|
|
* for precise cycles.
|
|
|
|
*/
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.get_event_constraints = tnt_get_event_constraints;
|
|
|
|
extra_attr = slm_format_attr;
|
|
|
|
pr_cont("Tremont events, ");
|
|
|
|
name = "Tremont";
|
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_WESTMERE:
|
|
|
|
case INTEL_FAM6_WESTMERE_EP:
|
|
|
|
case INTEL_FAM6_WESTMERE_EX:
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2011-03-03 10:34:48 +08:00
|
|
|
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_nhm();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_westmere_event_constraints;
|
2010-03-29 22:37:17 +08:00
|
|
|
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
|
2011-03-03 10:34:47 +08:00
|
|
|
x86_pmu.extra_regs = intel_westmere_extra_regs;
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2011-04-30 15:14:54 +08:00
|
|
|
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = nhm_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2011-04-30 15:14:54 +08:00
|
|
|
/* UOPS_ISSUED.STALLED_CYCLES */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-04-30 15:14:54 +08:00
|
|
|
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
|
2011-04-30 15:14:54 +08:00
|
|
|
|
2016-03-02 06:25:24 +08:00
|
|
|
intel_pmu_pebs_data_source_nhm();
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Westmere events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "westmere";
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
2010-02-01 22:36:30 +08:00
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_SANDYBRIDGE:
|
|
|
|
case INTEL_FAM6_SANDYBRIDGE_X:
|
2012-06-05 16:26:43 +08:00
|
|
|
x86_add_quirk(intel_sandybridge_quirk);
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2011-03-02 21:27:04 +08:00
|
|
|
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2012-07-17 17:27:55 +08:00
|
|
|
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2011-03-02 21:27:04 +08:00
|
|
|
|
2012-02-10 06:20:55 +08:00
|
|
|
intel_pmu_lbr_init_snb();
|
2011-03-02 21:27:04 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_snb_event_constraints;
|
2011-08-31 07:41:05 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
|
2012-06-05 16:26:43 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
|
2016-06-03 08:19:29 +08:00
|
|
|
if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X)
|
2013-04-16 19:51:43 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
|
|
|
else
|
|
|
|
x86_pmu.extra_regs = intel_snb_extra_regs;
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
|
2011-06-06 22:57:12 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2011-05-06 15:14:02 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = snb_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = snb_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2011-05-06 15:14:02 +08:00
|
|
|
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-05-06 15:14:02 +08:00
|
|
|
/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
|
2011-05-06 15:14:02 +08:00
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
pr_cont("SandyBridge events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "sandybridge";
|
2011-03-02 21:27:04 +08:00
|
|
|
break;
|
2014-07-30 18:08:56 +08:00
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_IVYBRIDGE:
|
|
|
|
case INTEL_FAM6_IVYBRIDGE_X:
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2012-09-11 07:07:01 +08:00
|
|
|
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2014-07-15 03:33:25 +08:00
|
|
|
/* dTLB-load-misses on IVB is different than SNB */
|
|
|
|
hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
|
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_snb();
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
x86_pmu.event_constraints = intel_ivb_event_constraints;
|
2012-09-11 07:07:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2016-06-03 08:19:29 +08:00
|
|
|
if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X)
|
2013-04-16 19:51:43 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
|
|
|
else
|
|
|
|
x86_pmu.extra_regs = intel_snb_extra_regs;
|
2012-09-11 07:07:01 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2012-09-11 07:07:01 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = snb_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = snb_mem_events_attrs;
|
2013-01-24 23:10:32 +08:00
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
|
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = nhm_format_attr;
|
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
pr_cont("IvyBridge events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "ivybridge";
|
2012-09-11 07:07:01 +08:00
|
|
|
break;
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_HASWELL:
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_HASWELL_X:
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_HASWELL_L:
|
2019-08-28 03:48:23 +08:00
|
|
|
case INTEL_FAM6_HASWELL_G:
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2019-02-05 06:23:30 +08:00
|
|
|
x86_add_quirk(intel_pebs_isolation_quirk);
|
2013-06-18 08:36:50 +08:00
|
|
|
x86_pmu.late_ack = true;
|
2015-02-18 10:18:04 +08:00
|
|
|
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
2013-06-18 08:36:48 +08:00
|
|
|
|
perf/x86/intel: Add basic Haswell LBR call stack support
Haswell has a new feature that utilizes the existing LBR facility to
record call chains. To enable this feature, bits (JCC, NEAR_IND_JMP,
NEAR_REL_JMP, FAR_BRANCH, EN_CALLSTACK) in LBR_SELECT must be set to 1,
bits (NEAR_REL_CALL, NEAR-IND_CALL, NEAR_RET) must be cleared. Due to
a hardware bug of Haswell, this feature doesn't work well with
FREEZE_LBRS_ON_PMI.
When the call stack feature is enabled, the LBR stack will capture
unfiltered call data normally, but as return instructions are executed,
the last captured branch record is flushed from the on-chip registers
in a last-in first-out (LIFO) manner. Thus, branch information relative
to leaf functions will not be captured, while preserving the call stack
information of the main line execution path.
This patch defines a separate lbr_sel map for Haswell. The map contains
a new entry for the call stack feature.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-05 10:56:00 +08:00
|
|
|
intel_pmu_lbr_init_hsw();
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_hsw_event_constraints;
|
2013-06-18 08:36:49 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
|
2014-08-01 05:05:22 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2013-06-18 08:36:48 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
2013-09-20 22:40:44 +08:00
|
|
|
x86_pmu.lbr_double_abort = true;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = hsw_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = hsw_mem_events_attrs;
|
|
|
|
tsx_attr = hsw_tsx_events_attrs;
|
2013-06-18 08:36:48 +08:00
|
|
|
pr_cont("Haswell events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "haswell";
|
2013-06-18 08:36:48 +08:00
|
|
|
break;
|
|
|
|
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_BROADWELL:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_BROADWELL_D:
|
2019-08-28 03:48:23 +08:00
|
|
|
case INTEL_FAM6_BROADWELL_G:
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_BROADWELL_X:
|
2019-02-05 06:23:30 +08:00
|
|
|
x86_add_quirk(intel_pebs_isolation_quirk);
|
2015-02-18 10:18:05 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
|
|
|
|
hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
|
|
|
|
BDW_L3_MISS|HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
|
|
|
|
HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
|
|
|
|
BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
|
|
|
|
BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
|
|
|
|
|
2015-04-02 16:12:57 +08:00
|
|
|
intel_pmu_lbr_init_hsw();
|
2015-02-18 10:18:05 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_bdw_event_constraints;
|
2016-03-04 03:50:42 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
|
2015-02-18 10:18:05 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2015-02-18 10:18:05 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2015-02-18 10:18:05 +08:00
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
x86_pmu.limit_period = bdw_limit_period;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = hsw_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = hsw_mem_events_attrs;
|
|
|
|
tsx_attr = hsw_tsx_events_attrs;
|
2015-02-18 10:18:05 +08:00
|
|
|
pr_cont("Broadwell events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "broadwell";
|
2015-02-18 10:18:05 +08:00
|
|
|
break;
|
|
|
|
|
2016-06-03 08:19:29 +08:00
|
|
|
case INTEL_FAM6_XEON_PHI_KNL:
|
2016-10-13 02:26:34 +08:00
|
|
|
case INTEL_FAM6_XEON_PHI_KNM:
|
2015-12-08 06:28:18 +08:00
|
|
|
memcpy(hw_cache_event_ids,
|
|
|
|
slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs,
|
|
|
|
knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
intel_pmu_lbr_init_knl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_knl_extra_regs;
|
|
|
|
|
|
|
|
/* all extra regs are per-cpu when HT is on */
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = slm_format_attr;
|
2016-10-13 02:26:34 +08:00
|
|
|
pr_cont("Knights Landing/Mill events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "knights-landing";
|
2015-12-08 06:28:18 +08:00
|
|
|
break;
|
|
|
|
|
2019-06-03 21:41:21 +08:00
|
|
|
case INTEL_FAM6_SKYLAKE_X:
|
|
|
|
pmem = true;
|
2019-06-25 00:19:13 +08:00
|
|
|
/* fall through */
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_SKYLAKE_L:
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_SKYLAKE:
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_KABYLAKE_L:
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_KABYLAKE:
|
2019-10-08 23:50:03 +08:00
|
|
|
case INTEL_FAM6_COMETLAKE_L:
|
|
|
|
case INTEL_FAM6_COMETLAKE:
|
2019-02-05 06:23:30 +08:00
|
|
|
x86_add_quirk(intel_pebs_isolation_quirk);
|
2015-05-11 03:22:44 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
2016-05-20 08:09:57 +08:00
|
|
|
/* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
|
|
|
|
event_attr_td_recovery_bubbles.event_str_noht =
|
|
|
|
"event=0xd,umask=0x1,cmask=1";
|
|
|
|
event_attr_td_recovery_bubbles.event_str_ht =
|
|
|
|
"event=0xd,umask=0x1,cmask=1,any=1";
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
x86_pmu.event_constraints = intel_skl_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_skl_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2015-05-11 03:22:44 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
2017-08-23 02:52:00 +08:00
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:17 +08:00
|
|
|
extra_skl_attr = skl_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
td_attr = hsw_events_attrs;
|
2018-09-06 21:57:48 +08:00
|
|
|
mem_attr = hsw_mem_events_attrs;
|
|
|
|
tsx_attr = hsw_tsx_events_attrs;
|
2019-06-03 21:41:21 +08:00
|
|
|
intel_pmu_pebs_data_source_skl(pmem);
|
2019-03-06 05:23:18 +08:00
|
|
|
|
|
|
|
if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
|
|
|
|
x86_pmu.flags |= PMU_FL_TFA;
|
|
|
|
x86_pmu.get_event_constraints = tfa_get_event_constraints;
|
|
|
|
x86_pmu.enable_all = intel_tfa_pmu_enable_all;
|
|
|
|
x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
|
|
|
|
}
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
pr_cont("Skylake events, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "skylake";
|
2015-05-11 03:22:44 +08:00
|
|
|
break;
|
|
|
|
|
2019-06-03 21:41:21 +08:00
|
|
|
case INTEL_FAM6_ICELAKE_X:
|
2019-08-28 03:48:24 +08:00
|
|
|
case INTEL_FAM6_ICELAKE_D:
|
2019-06-03 21:41:21 +08:00
|
|
|
pmem = true;
|
2019-06-25 00:19:13 +08:00
|
|
|
/* fall through */
|
2019-08-28 03:48:22 +08:00
|
|
|
case INTEL_FAM6_ICELAKE_L:
|
2019-08-28 03:48:21 +08:00
|
|
|
case INTEL_FAM6_ICELAKE:
|
2019-10-08 23:50:08 +08:00
|
|
|
case INTEL_FAM6_TIGERLAKE_L:
|
|
|
|
case INTEL_FAM6_TIGERLAKE:
|
2019-04-03 03:45:05 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_icl_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_icl_extra_regs;
|
|
|
|
x86_pmu.pebs_aliases = NULL;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = icl_get_event_constraints;
|
|
|
|
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
|
|
|
|
hsw_format_attr : nhm_format_attr;
|
2019-05-12 23:55:17 +08:00
|
|
|
extra_skl_attr = skl_format_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
mem_attr = icl_events_attrs;
|
|
|
|
tsx_attr = icl_tsx_events_attrs;
|
2019-04-03 03:45:05 +08:00
|
|
|
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02);
|
|
|
|
x86_pmu.lbr_pt_coexist = true;
|
2019-06-03 21:41:21 +08:00
|
|
|
intel_pmu_pebs_data_source_skl(pmem);
|
2019-04-03 03:45:05 +08:00
|
|
|
pr_cont("Icelake events, ");
|
|
|
|
name = "icelake";
|
|
|
|
break;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
default:
|
2011-06-29 23:42:36 +08:00
|
|
|
switch (x86_pmu.version) {
|
|
|
|
case 1:
|
|
|
|
x86_pmu.event_constraints = intel_v1_event_constraints;
|
|
|
|
pr_cont("generic architected perfmon v1, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "generic_arch_v1";
|
2011-06-29 23:42:36 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
/*
|
|
|
|
* default constraints for v2 and up
|
|
|
|
*/
|
|
|
|
x86_pmu.event_constraints = intel_gen_event_constraints;
|
|
|
|
pr_cont("generic architected perfmon, ");
|
2017-08-23 02:52:01 +08:00
|
|
|
name = "generic_arch_v2+";
|
2011-06-29 23:42:36 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
2011-11-10 20:57:26 +08:00
|
|
|
|
2018-10-28 20:58:28 +08:00
|
|
|
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
|
2017-08-23 02:52:01 +08:00
|
|
|
|
2017-08-23 02:52:00 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
group_events_td.attrs = td_attr;
|
|
|
|
group_events_mem.attrs = mem_attr;
|
|
|
|
group_events_tsx.attrs = tsx_attr;
|
2019-05-12 23:55:16 +08:00
|
|
|
group_format_extra.attrs = extra_attr;
|
2019-05-12 23:55:17 +08:00
|
|
|
group_format_extra_skl.attrs = extra_skl_attr;
|
2019-05-12 23:55:13 +08:00
|
|
|
|
|
|
|
x86_pmu.attr_update = attr_update;
|
2018-09-06 21:57:48 +08:00
|
|
|
|
2012-06-21 02:46:34 +08:00
|
|
|
if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
|
|
|
|
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
|
|
|
|
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
|
|
|
|
x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
|
|
|
|
}
|
2017-01-11 19:43:10 +08:00
|
|
|
x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
|
2012-06-21 02:46:34 +08:00
|
|
|
|
|
|
|
if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
|
|
|
|
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
|
|
|
|
x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
|
|
|
|
x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
|
|
|
|
}
|
|
|
|
|
|
|
|
x86_pmu.intel_ctrl |=
|
|
|
|
((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
|
|
|
|
|
|
|
|
if (x86_pmu.event_constraints) {
|
|
|
|
/*
|
|
|
|
* event on fixed counter2 (REF_CYCLES) only works on this
|
|
|
|
* counter, so do not extend mask to generic counters
|
|
|
|
*/
|
|
|
|
for_each_event_constraint(c, x86_pmu.event_constraints) {
|
2015-06-08 20:46:49 +08:00
|
|
|
if (c->cmask == FIXED_EVENT_FLAGS
|
|
|
|
&& c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
|
|
|
|
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
|
2012-06-21 02:46:34 +08:00
|
|
|
}
|
2015-06-08 20:46:49 +08:00
|
|
|
c->idxmsk64 &=
|
2016-05-11 21:51:51 +08:00
|
|
|
~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
|
2015-06-08 20:46:49 +08:00
|
|
|
c->weight = hweight64(c->idxmsk64);
|
2012-06-21 02:46:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Access LBR MSR may cause #GP under certain circumstances.
|
|
|
|
* E.g. KVM doesn't support LBR MSR
|
|
|
|
* Check all LBT MSR here.
|
|
|
|
* Disable LBR access if any LBR MSRs can not be accessed.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
|
|
|
|
x86_pmu.lbr_nr = 0;
|
|
|
|
for (i = 0; i < x86_pmu.lbr_nr; i++) {
|
|
|
|
if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
|
|
|
|
check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
|
|
|
|
x86_pmu.lbr_nr = 0;
|
|
|
|
}
|
|
|
|
|
2019-05-12 23:55:15 +08:00
|
|
|
if (x86_pmu.lbr_nr)
|
2016-06-22 02:31:10 +08:00
|
|
|
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
|
2017-08-23 02:52:01 +08:00
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Access extra MSR may cause #GP under certain circumstances.
|
|
|
|
* E.g. KVM doesn't support offcore event
|
|
|
|
* Check all extra_regs here.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.extra_regs) {
|
|
|
|
for (er = x86_pmu.extra_regs; er->msr; er++) {
|
2015-07-01 07:33:24 +08:00
|
|
|
er->extra_msr_access = check_msr(er->msr, 0x11UL);
|
2014-07-15 03:25:56 +08:00
|
|
|
/* Disable LBR select mapping */
|
|
|
|
if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
|
|
|
|
x86_pmu.lbr_sel_map = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-25 23:12:33 +08:00
|
|
|
/* Support full width counters using alternative MSR range */
|
|
|
|
if (x86_pmu.intel_cap.full_width_write) {
|
2016-11-30 04:33:28 +08:00
|
|
|
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
|
2013-06-25 23:12:33 +08:00
|
|
|
x86_pmu.perfctr = MSR_IA32_PMC0;
|
|
|
|
pr_cont("full-width counters, ");
|
|
|
|
}
|
|
|
|
|
2018-08-08 15:12:07 +08:00
|
|
|
/*
|
|
|
|
* For arch perfmon 4 use counter freezing to avoid
|
|
|
|
* several MSR accesses in the PMI.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.counter_freezing)
|
|
|
|
x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* HT bug: phase 2 init
|
|
|
|
* Called once we have valid topology information to check
|
|
|
|
* whether or not HT is enabled
|
|
|
|
* If HT is off, then we disable the workaround
|
|
|
|
*/
|
|
|
|
static __init int fixup_ht_bug(void)
|
|
|
|
{
|
2016-05-20 08:09:59 +08:00
|
|
|
int c;
|
2014-11-18 03:07:04 +08:00
|
|
|
/*
|
|
|
|
* problem not present on this CPU model, nothing to do
|
|
|
|
*/
|
|
|
|
if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
|
|
|
|
return 0;
|
|
|
|
|
2016-05-20 08:09:59 +08:00
|
|
|
if (topology_max_smt_threads() > 1) {
|
2014-11-18 03:07:04 +08:00
|
|
|
pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-09-13 03:36:56 +08:00
|
|
|
cpus_read_lock();
|
|
|
|
|
|
|
|
hardlockup_detector_perf_stop();
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
|
|
|
|
|
|
|
|
x86_pmu.start_scheduling = NULL;
|
2015-05-21 16:57:32 +08:00
|
|
|
x86_pmu.commit_scheduling = NULL;
|
2014-11-18 03:07:04 +08:00
|
|
|
x86_pmu.stop_scheduling = NULL;
|
|
|
|
|
2017-09-13 03:36:56 +08:00
|
|
|
hardlockup_detector_perf_restart();
|
2014-11-18 03:07:04 +08:00
|
|
|
|
2017-05-24 16:15:30 +08:00
|
|
|
for_each_online_cpu(c)
|
2019-03-06 05:23:15 +08:00
|
|
|
free_excl_cntrs(&per_cpu(cpu_hw_events, c));
|
2014-11-18 03:07:04 +08:00
|
|
|
|
2017-05-24 16:15:30 +08:00
|
|
|
cpus_read_unlock();
|
2014-11-18 03:07:04 +08:00
|
|
|
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
subsys_initcall(fixup_ht_bug)
|