2011-03-03 10:34:47 +08:00
|
|
|
/*
|
2011-06-06 22:57:03 +08:00
|
|
|
* Per core/cpu state
|
|
|
|
*
|
|
|
|
* Used to coordinate shared registers between HT threads or
|
|
|
|
* among events on a single PMU.
|
2011-03-03 10:34:47 +08:00
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
2011-05-27 00:22:53 +08:00
|
|
|
#include <linux/export.h>
|
2015-09-05 06:45:12 +08:00
|
|
|
#include <linux/nmi.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
#include <asm/cpufeature.h>
|
2011-08-31 07:41:05 +08:00
|
|
|
#include <asm/hardirq.h>
|
|
|
|
#include <asm/apic.h>
|
|
|
|
|
|
|
|
#include "perf_event.h"
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
2010-02-01 22:36:30 +08:00
|
|
|
* Intel PerfMon, used on Core and later.
|
2010-02-26 19:05:05 +08:00
|
|
|
*/
|
2011-04-27 17:51:41 +08:00
|
|
|
static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2012-07-06 17:59:46 +08:00
|
|
|
[PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
|
|
|
|
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
|
|
|
|
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
|
|
|
|
[PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
|
|
|
|
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
|
|
|
|
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
|
|
|
|
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
|
|
|
|
[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_core_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_core2_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
|
2010-02-01 22:36:30 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
|
2013-01-24 23:10:32 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
|
2011-03-03 10:34:47 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
|
2010-06-10 19:25:01 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
|
2010-02-26 19:05:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_snb_event_constraints[] __read_mostly =
|
2011-03-02 21:27:04 +08:00
|
|
|
{
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2013-03-17 21:49:57 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
2011-03-02 21:27:04 +08:00
|
|
|
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
|
2013-03-09 07:22:48 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
|
|
|
|
{
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
|
2013-09-11 23:22:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
|
2013-02-20 18:15:12 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
|
2013-01-24 23:10:32 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
|
2011-03-03 10:34:47 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2011-06-29 23:42:36 +08:00
|
|
|
static struct event_constraint intel_v1_event_constraints[] __read_mostly =
|
|
|
|
{
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2011-04-27 18:02:04 +08:00
|
|
|
static struct event_constraint intel_gen_event_constraints[] __read_mostly =
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-02-01 22:36:30 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
2011-12-11 07:28:51 +08:00
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2010-02-26 19:05:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2013-07-18 17:02:24 +08:00
|
|
|
static struct event_constraint intel_slm_event_constraints[] __read_mostly =
|
|
|
|
{
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
struct event_constraint intel_skl_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7,
|
|
|
|
MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7,
|
|
|
|
MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
|
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2011-06-06 22:57:12 +08:00
|
|
|
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
|
2013-01-24 23:10:32 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
2013-04-16 19:51:43 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
|
2013-07-18 17:02:23 +08:00
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
|
2013-06-08 05:22:10 +08:00
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
2011-06-06 22:57:12 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
|
|
|
|
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
|
2015-09-10 05:54:00 +08:00
|
|
|
/*
|
|
|
|
* Note the low 8 bits eventsel code is not a continuous field, containing
|
|
|
|
* some #GPing bits. These are masked out.
|
|
|
|
*/
|
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
|
2015-05-11 03:22:44 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
|
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
|
|
|
|
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
|
2013-01-24 23:10:32 +08:00
|
|
|
|
|
|
|
struct attribute *nhm_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_nhm),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
struct attribute *snb_events_attrs[] = {
|
|
|
|
EVENT_PTR(mem_ld_snb),
|
2013-01-24 23:10:34 +08:00
|
|
|
EVENT_PTR(mem_st_snb),
|
2013-01-24 23:10:32 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static struct event_constraint intel_hsw_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
2015-11-25 01:05:01 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
|
2013-06-18 08:36:48 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
|
|
|
|
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
|
|
|
|
/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
|
2013-06-18 08:36:48 +08:00
|
|
|
/* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
|
2013-06-18 08:36:48 +08:00
|
|
|
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
|
2015-03-10 02:20:22 +08:00
|
|
|
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
|
|
|
|
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2015-02-18 10:18:05 +08:00
|
|
|
struct event_constraint intel_bdw_event_constraints[] = {
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
|
|
|
|
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
|
|
|
|
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
|
2015-11-17 08:21:07 +08:00
|
|
|
INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
|
2015-02-18 10:18:05 +08:00
|
|
|
EVENT_CONSTRAINT_END
|
|
|
|
};
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static u64 intel_pmu_event_map(int hw_event)
|
|
|
|
{
|
|
|
|
return intel_perfmon_event_map[hw_event];
|
|
|
|
}
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
/*
|
|
|
|
* Notes on the events:
|
|
|
|
* - data reads do not include code reads (comparable to earlier tables)
|
|
|
|
* - data counts include speculative execution (except L1 write, dtlb, bpu)
|
|
|
|
* - remote node access includes remote memory, remote cache, remote mmio.
|
|
|
|
* - prefetches are not included in the counts.
|
|
|
|
* - icache miss does not include decoded icache
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define SKL_DEMAND_DATA_RD BIT_ULL(0)
|
|
|
|
#define SKL_DEMAND_RFO BIT_ULL(1)
|
|
|
|
#define SKL_ANY_RESPONSE BIT_ULL(16)
|
|
|
|
#define SKL_SUPPLIER_NONE BIT_ULL(17)
|
|
|
|
#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
|
|
|
|
#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
|
|
|
|
#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
|
|
|
|
#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
|
|
|
|
#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP0_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP1_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP2P_DRAM)
|
|
|
|
#define SKL_SPL_HIT BIT_ULL(30)
|
|
|
|
#define SKL_SNOOP_NONE BIT_ULL(31)
|
|
|
|
#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
|
|
|
|
#define SKL_SNOOP_MISS BIT_ULL(33)
|
|
|
|
#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
|
|
|
|
#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
|
|
|
|
#define SKL_SNOOP_HITM BIT_ULL(36)
|
|
|
|
#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
|
|
|
|
#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
|
|
|
|
SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
|
|
|
|
SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
|
|
|
|
SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
|
|
|
|
#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
|
|
|
|
#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
|
|
|
|
SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
|
|
|
|
SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
|
|
|
|
SKL_SNOOP_HITM|SKL_SPL_HIT)
|
|
|
|
#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
|
|
|
|
#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
|
|
|
|
#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP1_DRAM| \
|
|
|
|
SKL_L3_MISS_REMOTE_HOP2P_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 skl_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 skl_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_LLC_ACCESS|SKL_ANY_SNOOP,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS|SKL_ANY_SNOOP|
|
|
|
|
SKL_SUPPLIER_NONE,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_LLC_ACCESS|SKL_ANY_SNOOP,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS|SKL_ANY_SNOOP|
|
|
|
|
SKL_SUPPLIER_NONE,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_READ|
|
|
|
|
SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
|
|
|
|
SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2012-07-17 17:27:55 +08:00
|
|
|
#define SNB_DMND_DATA_RD (1ULL << 0)
|
|
|
|
#define SNB_DMND_RFO (1ULL << 1)
|
|
|
|
#define SNB_DMND_IFETCH (1ULL << 2)
|
|
|
|
#define SNB_DMND_WB (1ULL << 3)
|
|
|
|
#define SNB_PF_DATA_RD (1ULL << 4)
|
|
|
|
#define SNB_PF_RFO (1ULL << 5)
|
|
|
|
#define SNB_PF_IFETCH (1ULL << 6)
|
|
|
|
#define SNB_LLC_DATA_RD (1ULL << 7)
|
|
|
|
#define SNB_LLC_RFO (1ULL << 8)
|
|
|
|
#define SNB_LLC_IFETCH (1ULL << 9)
|
|
|
|
#define SNB_BUS_LOCKS (1ULL << 10)
|
|
|
|
#define SNB_STRM_ST (1ULL << 11)
|
|
|
|
#define SNB_OTHER (1ULL << 15)
|
|
|
|
#define SNB_RESP_ANY (1ULL << 16)
|
|
|
|
#define SNB_NO_SUPP (1ULL << 17)
|
|
|
|
#define SNB_LLC_HITM (1ULL << 18)
|
|
|
|
#define SNB_LLC_HITE (1ULL << 19)
|
|
|
|
#define SNB_LLC_HITS (1ULL << 20)
|
|
|
|
#define SNB_LLC_HITF (1ULL << 21)
|
|
|
|
#define SNB_LOCAL (1ULL << 22)
|
|
|
|
#define SNB_REMOTE (0xffULL << 23)
|
|
|
|
#define SNB_SNP_NONE (1ULL << 31)
|
|
|
|
#define SNB_SNP_NOT_NEEDED (1ULL << 32)
|
|
|
|
#define SNB_SNP_MISS (1ULL << 33)
|
|
|
|
#define SNB_NO_FWD (1ULL << 34)
|
|
|
|
#define SNB_SNP_FWD (1ULL << 35)
|
|
|
|
#define SNB_HITM (1ULL << 36)
|
|
|
|
#define SNB_NON_DRAM (1ULL << 37)
|
|
|
|
|
|
|
|
#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
|
|
|
|
#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
|
|
|
|
#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
|
|
|
|
#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
|
|
|
|
SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
|
|
|
|
SNB_HITM)
|
|
|
|
|
|
|
|
#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
|
|
|
|
#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
|
|
|
|
|
|
|
|
#define SNB_L3_ACCESS SNB_RESP_ANY
|
|
|
|
#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 snb_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
|
|
|
|
[ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
static __initconst const u64 snb_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
2011-03-02 21:27:04 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-03-02 21:27:04 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2012-07-17 17:27:55 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
};
|
|
|
|
|
2015-02-18 10:18:04 +08:00
|
|
|
/*
|
|
|
|
* Notes on the events:
|
|
|
|
* - data reads do not include code reads (comparable to earlier tables)
|
|
|
|
* - data counts include speculative execution (except L1 write, dtlb, bpu)
|
|
|
|
* - remote node access includes remote memory, remote cache, remote mmio.
|
|
|
|
* - prefetches are not included in the counts because they are not
|
|
|
|
* reliably counted.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define HSW_DEMAND_DATA_RD BIT_ULL(0)
|
|
|
|
#define HSW_DEMAND_RFO BIT_ULL(1)
|
|
|
|
#define HSW_ANY_RESPONSE BIT_ULL(16)
|
|
|
|
#define HSW_SUPPLIER_NONE BIT_ULL(17)
|
|
|
|
#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
|
|
|
|
#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
|
|
|
|
#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
|
|
|
|
#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
|
|
|
|
#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
#define HSW_SNOOP_NONE BIT_ULL(31)
|
|
|
|
#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
|
|
|
|
#define HSW_SNOOP_MISS BIT_ULL(33)
|
|
|
|
#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
|
|
|
|
#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
|
|
|
|
#define HSW_SNOOP_HITM BIT_ULL(36)
|
|
|
|
#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
|
|
|
|
#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
|
|
|
|
HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
|
|
|
|
HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
|
|
|
|
HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
|
|
|
|
#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
|
|
|
|
#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
|
|
|
|
#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
|
|
|
|
#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
|
|
|
|
HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
|
|
|
|
|
2015-02-18 10:18:05 +08:00
|
|
|
#define BDW_L3_MISS_LOCAL BIT(26)
|
|
|
|
#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
|
|
|
|
HSW_L3_MISS_REMOTE_HOP2P)
|
|
|
|
|
|
|
|
|
2015-02-18 10:18:04 +08:00
|
|
|
static __initconst const u64 hsw_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
|
|
|
|
[ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 hsw_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS|HSW_ANY_SNOOP,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS|HSW_ANY_SNOOP,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS_LOCAL_DRAM|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_READ|
|
|
|
|
HSW_L3_MISS_REMOTE|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS_LOCAL_DRAM|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
[ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
|
|
|
|
HSW_L3_MISS_REMOTE|
|
|
|
|
HSW_SNOOP_DRAM,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 westmere_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
|
|
|
* Use RFO, not WRITEBACK, because a write miss would typically occur
|
|
|
|
* on RFO.
|
|
|
|
*/
|
2010-02-26 19:05:05 +08:00
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
2011-03-03 10:34:48 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2011-04-23 06:57:42 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
2011-04-23 06:57:42 +08:00
|
|
|
* Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
|
|
|
|
* See IA32 SDM Vol 3B 30.6.1.3
|
2011-03-03 10:34:48 +08:00
|
|
|
*/
|
|
|
|
|
2011-04-23 06:57:42 +08:00
|
|
|
#define NHM_DMND_DATA_RD (1 << 0)
|
|
|
|
#define NHM_DMND_RFO (1 << 1)
|
|
|
|
#define NHM_DMND_IFETCH (1 << 2)
|
|
|
|
#define NHM_DMND_WB (1 << 3)
|
|
|
|
#define NHM_PF_DATA_RD (1 << 4)
|
|
|
|
#define NHM_PF_DATA_RFO (1 << 5)
|
|
|
|
#define NHM_PF_IFETCH (1 << 6)
|
|
|
|
#define NHM_OFFCORE_OTHER (1 << 7)
|
|
|
|
#define NHM_UNCORE_HIT (1 << 8)
|
|
|
|
#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
|
|
|
|
#define NHM_OTHER_CORE_HITM (1 << 10)
|
|
|
|
/* reserved */
|
|
|
|
#define NHM_REMOTE_CACHE_FWD (1 << 12)
|
|
|
|
#define NHM_REMOTE_DRAM (1 << 13)
|
|
|
|
#define NHM_LOCAL_DRAM (1 << 14)
|
|
|
|
#define NHM_NON_DRAM (1 << 15)
|
|
|
|
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
|
|
|
|
#define NHM_REMOTE (NHM_REMOTE_DRAM)
|
2011-04-23 06:57:42 +08:00
|
|
|
|
|
|
|
#define NHM_DMND_READ (NHM_DMND_DATA_RD)
|
|
|
|
#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
|
|
|
|
#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
|
|
|
|
|
|
|
|
#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
|
2011-04-23 06:57:42 +08:00
|
|
|
#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
|
2011-03-03 10:34:48 +08:00
|
|
|
|
|
|
|
static __initconst const u64 nehalem_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-04-23 06:57:42 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
|
2011-03-03 10:34:48 +08:00
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
perf/x86: Fix local vs remote memory events for NHM/WSM
Verified using the below proglet.. before:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,101,554 node-stores
2,096,931 node-store-misses
5.021546079 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
501,137 node-stores
199 node-store-misses
5.124451068 seconds time elapsed
After:
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write
Performance counter stats for './numa 0':
2,107,516 node-stores
2,097,187 node-store-misses
5.012755149 seconds time elapsed
[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write
Performance counter stats for './numa 1':
2,063,355 node-stores
165 node-store-misses
5.082091494 seconds time elapsed
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>
#define SIZE (32*1024*1024)
volatile int done;
void sig_done(int sig)
{
done = 1;
}
int main(int argc, char **argv)
{
cpu_set_t *mask, *mask2;
size_t size;
int i, err, t;
int nrcpus = 1024;
char *mem;
unsigned long nodemask = 0x01; /* node 0 */
DIR *node;
struct dirent *de;
int read = 0;
int local = 0;
if (argc < 2) {
printf("usage: %s [0-3]\n", argv[0]);
printf(" bit0 - local/remote\n");
printf(" bit1 - read/write\n");
exit(0);
}
switch (atoi(argv[1])) {
case 0:
printf("remote write\n");
break;
case 1:
printf("local write\n");
local = 1;
break;
case 2:
printf("remote read\n");
read = 1;
break;
case 3:
printf("local read\n");
local = 1;
read = 1;
break;
}
mask = CPU_ALLOC(nrcpus);
size = CPU_ALLOC_SIZE(nrcpus);
CPU_ZERO_S(size, mask);
node = opendir("/sys/devices/system/node/node0/");
if (!node)
perror("opendir");
while ((de = readdir(node))) {
int cpu;
if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
CPU_SET_S(cpu, size, mask);
}
closedir(node);
mask2 = CPU_ALLOC(nrcpus);
CPU_ZERO_S(size, mask2);
for (i = 0; i < size; i++)
CPU_SET_S(i, size, mask2);
CPU_XOR_S(size, mask2, mask2, mask); // invert
if (!local)
mask = mask2;
err = sched_setaffinity(0, size, mask);
if (err)
perror("sched_setaffinity");
mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
if (err)
perror("mbind");
signal(SIGALRM, sig_done);
alarm(5);
if (!read) {
while (!done) {
for (i = 0; i < SIZE; i++)
mem[i] = 0x01;
}
} else {
while (!done) {
for (i = 0; i < SIZE; i++)
t += *(volatile char *)(mem + i);
}
}
return 0;
}
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-06 06:59:25 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
|
|
|
|
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
|
2011-04-23 05:37:06 +08:00
|
|
|
},
|
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 nehalem_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
perf, x86: Update/fix Intel Nehalem cache events
Change the Nehalem cache events to use retired memory instruction counters
(similar to Westmere), this greatly improves the provided stats.
Using:
main ()
{
int i;
for (i = 0; i < 1000000000; i++) {
asm("mov (%%rsp), %%rbx;"
"mov %%rbx, (%%rsp);" : : : "rbx");
}
}
We find:
$ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% )
4,999,502,846 l1-dcache-loads:u ( +- 0.008% )
1,000,034,832 l1-dcache-stores:u ( +- 0.000% )
1.565184942 seconds time elapsed ( +- 0.005% )
The 5b is surprising - we'd expect 1b:
$ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% )
1,000,021,961 r10b:u ( +- 0.000% )
1,000,030,951 l1-dcache-stores:u ( +- 0.000% )
1.565055422 seconds time elapsed ( +- 0.003% )
Which this patch thus fixes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-22 19:39:56 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
perf, x86: Update/fix Intel Nehalem cache events
Change the Nehalem cache events to use retired memory instruction counters
(similar to Westmere), this greatly improves the provided stats.
Using:
main ()
{
int i;
for (i = 0; i < 1000000000; i++) {
asm("mov (%%rsp), %%rbx;"
"mov %%rbx, (%%rsp);" : : : "rbx");
}
}
We find:
$ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% )
4,999,502,846 l1-dcache-loads:u ( +- 0.008% )
1,000,034,832 l1-dcache-stores:u ( +- 0.000% )
1.565184942 seconds time elapsed ( +- 0.005% )
The 5b is surprising - we'd expect 1b:
$ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores
Performance counter stats for './loop_1b_loads+stores' (10 runs):
4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% )
1,000,021,961 r10b:u ( +- 0.000% )
1,000,030,951 l1-dcache-stores:u ( +- 0.000% )
1.565055422 seconds time elapsed ( +- 0.003% )
Which this patch thus fixes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-22 19:39:56 +08:00
|
|
|
[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
2011-03-03 10:34:48 +08:00
|
|
|
/*
|
|
|
|
* Use RFO, not WRITEBACK, because a write miss would typically occur
|
|
|
|
* on RFO.
|
|
|
|
*/
|
2010-02-26 19:05:05 +08:00
|
|
|
[ C(OP_WRITE) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
2011-03-03 10:34:48 +08:00
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
2010-02-26 19:05:05 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
2011-04-23 05:37:06 +08:00
|
|
|
[ C(NODE) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 core2_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const u64 atom_hw_cache_event_ids
|
2010-02-26 19:05:05 +08:00
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
|
|
|
|
[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2013-07-18 17:02:24 +08:00
|
|
|
static struct extra_reg intel_slm_extra_regs[] __read_mostly =
|
|
|
|
{
|
|
|
|
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
|
perf/x86/intel: Fix Silvermont offcore masks
Fengguang Wu reported:
> sparse warnings: (new ones prefixed by >>)
>
> >> arch/x86/kernel/cpu/perf_event_intel.c:901:9: sparse: constant 0x768005ffff is so big it is long
> >> arch/x86/kernel/cpu/perf_event_intel.c:902:9: sparse: constant 0x768005ffff is so big it is long
>
> vim +901 arch/x86/kernel/cpu/perf_event_intel.c
>
> 895 },
> 896 };
> 897
> 898 static struct extra_reg intel_slm_extra_regs[] __read_mostly =
> 899 {
> 900 /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
> > 901 INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0),
> > 902 INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1),
> 903 EVENT_EXTRA_END
> 904 };
> 905
Extend those constants to 64 bits.
Reported-by: fengguang.wu@intel.com
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130909112636.GQ31370@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-09 19:26:36 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
|
2015-06-25 02:23:35 +08:00
|
|
|
INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
|
2013-07-18 17:02:24 +08:00
|
|
|
EVENT_EXTRA_END
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SLM_DMND_READ SNB_DMND_DATA_RD
|
|
|
|
#define SLM_DMND_WRITE SNB_DMND_RFO
|
|
|
|
#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
|
|
|
|
|
|
|
|
#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
|
|
|
|
#define SLM_LLC_ACCESS SNB_RESP_ANY
|
|
|
|
#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 slm_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0,
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
|
|
|
|
[ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static __initconst const u64 slm_hw_cache_event_ids
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
|
|
|
|
{
|
|
|
|
[ C(L1D) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(L1I ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
|
|
|
|
[ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(LL ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0,
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x01b7,
|
|
|
|
/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
|
|
|
|
[ C(RESULT_MISS) ] = 0x01b7,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(DTLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0,
|
|
|
|
[ C(RESULT_MISS) ] = 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(ITLB) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
|
2015-04-21 17:34:41 +08:00
|
|
|
[ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
|
2013-07-18 17:02:24 +08:00
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[ C(BPU ) ] = {
|
|
|
|
[ C(OP_READ) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
|
|
|
|
[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
|
|
|
|
},
|
|
|
|
[ C(OP_WRITE) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
[ C(OP_PREFETCH) ] = {
|
|
|
|
[ C(RESULT_ACCESS) ] = -1,
|
|
|
|
[ C(RESULT_MISS) ] = -1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
|
|
|
|
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
|
|
|
|
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
|
|
|
|
#define KNL_MCDRAM_FAR BIT_ULL(22)
|
|
|
|
#define KNL_DDR_LOCAL BIT_ULL(23)
|
|
|
|
#define KNL_DDR_FAR BIT_ULL(24)
|
|
|
|
#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
|
|
|
|
KNL_DDR_LOCAL | KNL_DDR_FAR)
|
|
|
|
#define KNL_L2_READ SLM_DMND_READ
|
|
|
|
#define KNL_L2_WRITE SLM_DMND_WRITE
|
|
|
|
#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
|
|
|
|
#define KNL_L2_ACCESS SLM_LLC_ACCESS
|
|
|
|
#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
|
|
|
|
KNL_DRAM_ANY | SNB_SNP_ANY | \
|
|
|
|
SNB_NON_DRAM)
|
|
|
|
|
|
|
|
static __initconst const u64 knl_hw_cache_extra_regs
|
|
|
|
[PERF_COUNT_HW_CACHE_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_OP_MAX]
|
|
|
|
[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
|
|
|
|
[C(LL)] = {
|
|
|
|
[C(OP_READ)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = 0,
|
|
|
|
},
|
|
|
|
[C(OP_WRITE)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
|
|
|
|
},
|
|
|
|
[C(OP_PREFETCH)] = {
|
|
|
|
[C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
|
|
|
|
[C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
/*
|
|
|
|
* Use from PMIs where the LBRs are already disabled.
|
|
|
|
*/
|
|
|
|
static void __intel_pmu_disable_all(void)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_disable_bts();
|
2015-01-30 18:40:35 +08:00
|
|
|
else
|
|
|
|
intel_bts_disable_local();
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
|
|
|
intel_pmu_pebs_disable_all();
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_disable_all(void)
|
|
|
|
{
|
|
|
|
__intel_pmu_disable_all();
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_disable_all();
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
static void __intel_pmu_enable_all(int added, bool pmi)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-08 20:57:14 +08:00
|
|
|
intel_pmu_pebs_enable_all();
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
intel_pmu_lbr_enable_all(pmi);
|
2011-10-05 20:01:21 +08:00
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
|
|
|
|
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
|
2010-02-26 19:05:05 +08:00
|
|
|
struct perf_event *event =
|
2012-06-21 02:46:33 +08:00
|
|
|
cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!event))
|
|
|
|
return;
|
|
|
|
|
|
|
|
intel_pmu_enable_bts(event->hw.config);
|
2015-01-30 18:40:35 +08:00
|
|
|
} else
|
|
|
|
intel_bts_enable_local();
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
static void intel_pmu_enable_all(int added)
|
|
|
|
{
|
|
|
|
__intel_pmu_enable_all(added, false);
|
|
|
|
}
|
|
|
|
|
2010-03-26 21:08:44 +08:00
|
|
|
/*
|
|
|
|
* Workaround for:
|
|
|
|
* Intel Errata AAK100 (model 26)
|
|
|
|
* Intel Errata AAP53 (model 30)
|
2010-03-29 22:37:17 +08:00
|
|
|
* Intel Errata BD53 (model 44)
|
2010-03-26 21:08:44 +08:00
|
|
|
*
|
2010-08-06 13:39:08 +08:00
|
|
|
* The official story:
|
|
|
|
* These chips need to be 'reset' when adding counters by programming the
|
|
|
|
* magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
|
|
|
|
* in sequence on the same PMC or on different PMCs.
|
|
|
|
*
|
|
|
|
* In practise it appears some of these events do in fact count, and
|
|
|
|
* we need to programm all 4 events.
|
2010-03-26 21:08:44 +08:00
|
|
|
*/
|
2010-08-06 13:39:08 +08:00
|
|
|
static void intel_pmu_nhm_workaround(void)
|
2010-03-26 21:08:44 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-08-06 13:39:08 +08:00
|
|
|
static const unsigned long nhm_magic[4] = {
|
|
|
|
0x4300B5,
|
|
|
|
0x4300D2,
|
|
|
|
0x4300B1,
|
|
|
|
0x4300B1
|
|
|
|
};
|
|
|
|
struct perf_event *event;
|
|
|
|
int i;
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/*
|
|
|
|
* The Errata requires below steps:
|
|
|
|
* 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 2) Configure 4 PERFEVTSELx with the magic events and clear
|
|
|
|
* the corresponding PMCx;
|
|
|
|
* 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
* 5) Clear 4 pairs of ERFEVTSELx and PMCx;
|
|
|
|
*/
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/*
|
|
|
|
* The real steps we choose are a little different from above.
|
|
|
|
* A) To reduce MSR operations, we don't run step 1) as they
|
|
|
|
* are already cleared before this function is called;
|
|
|
|
* B) Call x86_perf_event_update to save PMCx before configuring
|
|
|
|
* PERFEVTSELx with magic number;
|
|
|
|
* C) With step 5), we do clear only when the PERFEVTSELx is
|
|
|
|
* not used currently.
|
|
|
|
* D) Call x86_perf_event_set_period to restore PMCx;
|
|
|
|
*/
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
/* We always operate 4 pairs of PERF Counters */
|
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
event = cpuc->events[i];
|
|
|
|
if (event)
|
|
|
|
x86_perf_event_update(event);
|
|
|
|
}
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
|
|
|
|
}
|
|
|
|
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
|
2010-03-26 21:08:44 +08:00
|
|
|
|
2010-08-06 13:39:08 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
event = cpuc->events[i];
|
|
|
|
|
|
|
|
if (event) {
|
|
|
|
x86_perf_event_set_period(event);
|
2010-04-14 04:23:14 +08:00
|
|
|
__x86_pmu_enable_event(&event->hw,
|
2010-08-06 13:39:08 +08:00
|
|
|
ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
} else
|
|
|
|
wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
|
2010-03-26 21:08:44 +08:00
|
|
|
}
|
2010-08-06 13:39:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_nhm_enable_all(int added)
|
|
|
|
{
|
|
|
|
if (added)
|
|
|
|
intel_pmu_nhm_workaround();
|
2010-03-26 21:08:44 +08:00
|
|
|
intel_pmu_enable_all(added);
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static inline u64 intel_pmu_get_status(void)
|
|
|
|
{
|
|
|
|
u64 status;
|
|
|
|
|
|
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_pmu_ack_status(u64 ack)
|
|
|
|
{
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
|
|
|
|
}
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2012-06-21 02:46:33 +08:00
|
|
|
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
|
2010-02-26 19:05:05 +08:00
|
|
|
u64 ctrl_val, mask;
|
|
|
|
|
|
|
|
mask = 0xfULL << (idx * 4);
|
|
|
|
|
|
|
|
rdmsrl(hwc->config_base, ctrl_val);
|
|
|
|
ctrl_val &= ~mask;
|
2010-03-08 20:51:31 +08:00
|
|
|
wrmsrl(hwc->config_base, ctrl_val);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
2013-09-12 18:53:44 +08:00
|
|
|
static inline bool event_is_checkpointed(struct perf_event *event)
|
|
|
|
{
|
|
|
|
return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
|
|
|
|
}
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
static void intel_pmu_disable_event(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:32:08 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-03-03 03:32:08 +08:00
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_disable_bts();
|
|
|
|
intel_pmu_drain_bts_buffer();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
|
|
|
|
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
|
2013-09-12 18:53:44 +08:00
|
|
|
cpuc->intel_cp_status &= ~(1ull << hwc->idx);
|
2011-10-05 20:01:21 +08:00
|
|
|
|
2012-02-10 06:20:57 +08:00
|
|
|
/*
|
|
|
|
* must disable before any actual event
|
|
|
|
* because any event may be combined with LBR
|
|
|
|
*/
|
2014-11-05 10:56:06 +08:00
|
|
|
if (needs_branch_stack(event))
|
2012-02-10 06:20:57 +08:00
|
|
|
intel_pmu_lbr_disable(event);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
|
2010-03-03 03:32:08 +08:00
|
|
|
intel_pmu_disable_fixed(hwc);
|
2010-02-26 19:05:05 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-03-03 03:32:08 +08:00
|
|
|
x86_pmu_disable_event(event);
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2010-04-09 05:03:20 +08:00
|
|
|
if (unlikely(event->attr.precise_ip))
|
2010-03-03 20:12:23 +08:00
|
|
|
intel_pmu_pebs_disable(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2012-06-21 02:46:33 +08:00
|
|
|
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
|
2010-02-26 19:05:05 +08:00
|
|
|
u64 ctrl_val, bits, mask;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enable IRQ generation (0x8),
|
|
|
|
* and enable ring-3 counting (0x2) and ring-0 counting (0x1)
|
|
|
|
* if requested:
|
|
|
|
*/
|
|
|
|
bits = 0x8ULL;
|
|
|
|
if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
|
|
|
|
bits |= 0x2;
|
|
|
|
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
|
|
|
|
bits |= 0x1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ANY bit is supported in v3 and up
|
|
|
|
*/
|
|
|
|
if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
|
|
|
|
bits |= 0x4;
|
|
|
|
|
|
|
|
bits <<= (idx * 4);
|
|
|
|
mask = 0xfULL << (idx * 4);
|
|
|
|
|
|
|
|
rdmsrl(hwc->config_base, ctrl_val);
|
|
|
|
ctrl_val &= ~mask;
|
|
|
|
ctrl_val |= bits;
|
2010-03-08 20:51:31 +08:00
|
|
|
wrmsrl(hwc->config_base, ctrl_val);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
2010-03-03 03:32:08 +08:00
|
|
|
static void intel_pmu_enable_event(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:32:08 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-03-03 03:32:08 +08:00
|
|
|
|
2012-06-21 02:46:33 +08:00
|
|
|
if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
|
2010-12-18 23:28:55 +08:00
|
|
|
if (!__this_cpu_read(cpu_hw_events.enabled))
|
2010-02-26 19:05:05 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
intel_pmu_enable_bts(hwc->config);
|
|
|
|
return;
|
|
|
|
}
|
2012-02-10 06:20:57 +08:00
|
|
|
/*
|
|
|
|
* must enabled before any actual event
|
|
|
|
* because any event may be combined with LBR
|
|
|
|
*/
|
2014-11-05 10:56:06 +08:00
|
|
|
if (needs_branch_stack(event))
|
2012-02-10 06:20:57 +08:00
|
|
|
intel_pmu_lbr_enable(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
if (event->attr.exclude_host)
|
|
|
|
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
|
|
|
|
if (event->attr.exclude_guest)
|
|
|
|
cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
|
|
|
|
|
2013-09-12 18:53:44 +08:00
|
|
|
if (unlikely(event_is_checkpointed(event)))
|
|
|
|
cpuc->intel_cp_status |= (1ull << hwc->idx);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
|
2010-03-03 03:32:08 +08:00
|
|
|
intel_pmu_enable_fixed(hwc);
|
2010-02-26 19:05:05 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-04-09 05:03:20 +08:00
|
|
|
if (unlikely(event->attr.precise_ip))
|
2010-03-03 20:12:23 +08:00
|
|
|
intel_pmu_pebs_enable(event);
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2010-04-14 04:23:14 +08:00
|
|
|
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save and restart an expired event. Called by NMI contexts,
|
|
|
|
* so it has to be careful about preempting normal event ops:
|
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
int intel_pmu_save_and_restart(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
2010-03-03 03:18:39 +08:00
|
|
|
x86_perf_event_update(event);
|
2013-09-06 11:37:38 +08:00
|
|
|
/*
|
|
|
|
* For a checkpointed counter always reset back to 0. This
|
|
|
|
* avoids a situation where the counter overflows, aborts the
|
|
|
|
* transaction and is then set back to shortly before the
|
|
|
|
* overflow, and overflows and aborts again.
|
|
|
|
*/
|
|
|
|
if (unlikely(event_is_checkpointed(event))) {
|
|
|
|
/* No race with NMIs because the counter should not be armed */
|
|
|
|
wrmsrl(event->hw.event_base, 0);
|
|
|
|
local64_set(&event->hw.prev_count, 0);
|
|
|
|
}
|
2010-03-03 03:18:39 +08:00
|
|
|
return x86_perf_event_set_period(event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pmu_reset(void)
|
|
|
|
{
|
2010-12-18 23:28:55 +08:00
|
|
|
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
|
2010-02-26 19:05:05 +08:00
|
|
|
unsigned long flags;
|
|
|
|
int idx;
|
|
|
|
|
2010-03-30 00:36:50 +08:00
|
|
|
if (!x86_pmu.num_counters)
|
2010-02-26 19:05:05 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-30 00:36:50 +08:00
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
2012-06-08 04:32:04 +08:00
|
|
|
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
|
|
|
|
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
2010-03-30 00:36:50 +08:00
|
|
|
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
|
2012-06-08 04:32:04 +08:00
|
|
|
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
|
2010-03-30 00:36:50 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (ds)
|
|
|
|
ds->bts_index = ds->bts_buffer_base;
|
|
|
|
|
2015-02-28 01:48:30 +08:00
|
|
|
/* Ack all overflows and disable fixed counters */
|
|
|
|
if (x86_pmu.version >= 2) {
|
|
|
|
intel_pmu_ack_status(intel_pmu_get_status());
|
|
|
|
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Reset LBRs and LBR freezing */
|
|
|
|
if (x86_pmu.lbr_nr) {
|
|
|
|
update_debugctlmsr(get_debugctlmsr() &
|
|
|
|
~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This handler is triggered by the local APIC, so the APIC IRQ handling
|
|
|
|
* rules apply:
|
|
|
|
*/
|
|
|
|
static int intel_pmu_handle_irq(struct pt_regs *regs)
|
|
|
|
{
|
|
|
|
struct perf_sample_data data;
|
|
|
|
struct cpu_hw_events *cpuc;
|
|
|
|
int bit, loops;
|
2010-09-03 03:07:47 +08:00
|
|
|
u64 status;
|
2010-09-10 19:28:01 +08:00
|
|
|
int handled;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2011-04-27 18:32:33 +08:00
|
|
|
/*
|
2013-06-18 08:36:50 +08:00
|
|
|
* No known reason to not always do late ACK,
|
|
|
|
* but just in case do it opt-in.
|
2011-04-27 18:32:33 +08:00
|
|
|
*/
|
2013-06-18 08:36:50 +08:00
|
|
|
if (!x86_pmu.late_ack)
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
__intel_pmu_disable_all();
|
2010-09-10 19:28:01 +08:00
|
|
|
handled = intel_pmu_drain_bts_buffer();
|
2015-01-30 18:40:35 +08:00
|
|
|
handled += intel_bts_interrupt();
|
2010-02-26 19:05:05 +08:00
|
|
|
status = intel_pmu_get_status();
|
2014-02-15 08:44:08 +08:00
|
|
|
if (!status)
|
|
|
|
goto done;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
loops = 0;
|
|
|
|
again:
|
2015-05-11 03:22:47 +08:00
|
|
|
intel_pmu_lbr_read();
|
2010-09-03 03:07:47 +08:00
|
|
|
intel_pmu_ack_status(status);
|
2010-02-26 19:05:05 +08:00
|
|
|
if (++loops > 100) {
|
2013-05-31 01:45:59 +08:00
|
|
|
static bool warned = false;
|
|
|
|
if (!warned) {
|
|
|
|
WARN(1, "perfevents: irq loop stuck!\n");
|
|
|
|
perf_event_print_debug();
|
|
|
|
warned = true;
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
intel_pmu_reset();
|
2010-03-08 20:51:01 +08:00
|
|
|
goto done;
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inc_irq_stat(apic_perf_irqs);
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled do to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the bahviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
/*
|
2015-05-11 03:22:45 +08:00
|
|
|
* Ignore a range of extra bits in status that do not indicate
|
|
|
|
* overflow by themselves.
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled do to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the bahviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
*/
|
2015-05-11 03:22:45 +08:00
|
|
|
status &= ~(GLOBAL_STATUS_COND_CHG |
|
|
|
|
GLOBAL_STATUS_ASIF |
|
|
|
|
GLOBAL_STATUS_LBRS_FROZEN);
|
|
|
|
if (!status)
|
|
|
|
goto done;
|
perf/x86/intel: ignore CondChgd bit to avoid false NMI handling
Currently, any NMI is falsely handled by a NMI handler of NMI watchdog
if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set.
For example, we use external NMI to make system panic to get crash
dump, but in this case, the external NMI is falsely handled do to the
issue.
This commit deals with the issue simply by ignoring CondChgd bit.
Here is explanation in detail.
On x86 NMI watchdog uses performance monitoring feature to
periodically signal NMI each time performance counter gets overflowed.
intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI
handler of NMI watchdog, perf_event_nmi_handler(). It identifies an
owner of a given NMI by looking at overflow status bits in
MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it
handles the given NMI as its own NMI.
The problem is that the intel_pmu_handle_irq() doesn't distinguish
CondChgd bit from other bits. Unlike the other status bits, CondChgd
bit doesn't represent overflow status for performance counters. Thus,
CondChgd bit cannot be thought of as a mark indicating a given NMI is
NMI watchdog's.
As a result, if CondChgd bit is set, any NMI is falsely handled by the
NMI handler of NMI watchdog. Also, if type of the falsely handled NMI
is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding
action is never performed until CondChgd bit is cleared.
I noticed this behavior on systems with Ivy Bridge processors: Intel
Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems,
CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set
in the beginning at boot. Then the CondChgd bit is immediately cleared
by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain
0.
On the other hand, on older processors such as Nehalem, Xeon E7540,
CondChgd bit is not set in the beginning at boot.
I'm not sure about exact behavior of CondChgd bit, in particular when
this bit is set. Although I read Intel System Programmer's Manual to
figure out that, the descriptions I found are:
In 18.9.1:
"The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to
indicate changes to the state of performancmonitoring hardware"
In Table 35-2 IA-32 Architectural MSRs
63 CondChg: status bits of this register has changed.
These are different from the bahviour I see on the actual system as I
explained above.
At least, I think ignoring CondChgd bit should be enough for NMI
watchdog perspective.
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-06-25 09:09:07 +08:00
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
/*
|
|
|
|
* PEBS overflow sets bit 62 in the global status register
|
|
|
|
*/
|
2010-09-03 03:07:49 +08:00
|
|
|
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
|
|
|
|
handled++;
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
x86_pmu.drain_pebs(regs);
|
2010-09-03 03:07:49 +08:00
|
|
|
}
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2015-01-30 18:39:52 +08:00
|
|
|
/*
|
|
|
|
* Intel PT
|
|
|
|
*/
|
|
|
|
if (__test_and_clear_bit(55, (unsigned long *)&status)) {
|
|
|
|
handled++;
|
|
|
|
intel_pt_interrupt();
|
|
|
|
}
|
|
|
|
|
2013-09-06 11:37:38 +08:00
|
|
|
/*
|
2013-09-12 18:53:44 +08:00
|
|
|
* Checkpointed counters can lead to 'spurious' PMIs because the
|
|
|
|
* rollback caused by the PMI will have cleared the overflow status
|
|
|
|
* bit. Therefore always force probe these counters.
|
2013-09-06 11:37:38 +08:00
|
|
|
*/
|
2013-09-12 18:53:44 +08:00
|
|
|
status |= cpuc->intel_cp_status;
|
2013-09-06 11:37:38 +08:00
|
|
|
|
2010-03-06 05:41:37 +08:00
|
|
|
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
|
2010-02-26 19:05:05 +08:00
|
|
|
struct perf_event *event = cpuc->events[bit];
|
|
|
|
|
2010-09-03 03:07:49 +08:00
|
|
|
handled++;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
if (!test_bit(bit, cpuc->active_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!intel_pmu_save_and_restart(event))
|
|
|
|
continue;
|
|
|
|
|
2012-04-03 02:19:08 +08:00
|
|
|
perf_sample_data_init(&data, 0, event->hw.last_period);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2012-02-10 06:20:57 +08:00
|
|
|
if (has_branch_stack(event))
|
|
|
|
data.br_stack = &cpuc->lbr_stack;
|
|
|
|
|
2011-06-27 20:41:57 +08:00
|
|
|
if (perf_event_overflow(event, &data, regs))
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
x86_pmu_stop(event, 0);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Repeat if there is more work to be done:
|
|
|
|
*/
|
|
|
|
status = intel_pmu_get_status();
|
|
|
|
if (status)
|
|
|
|
goto again;
|
|
|
|
|
2010-03-08 20:51:01 +08:00
|
|
|
done:
|
perf/x86/intel: Streamline LBR MSR handling in PMI
The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.
So there is no need to disable the LBRs explicitely in the
PMI handler.
Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
5) | /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
5) | /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
5) | /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
5) | /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
This patch:
- Avoids disabling already frozen LBRs unnecessarily in the PMI
- Avoids changing LBR_SELECT in the PMI
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-21 01:11:23 +08:00
|
|
|
__intel_pmu_enable_all(0, true);
|
2013-06-18 08:36:50 +08:00
|
|
|
/*
|
|
|
|
* Only unmask the NMI after the overflow counters
|
|
|
|
* have been reset. This avoids spurious NMIs on
|
|
|
|
* Haswell CPUs.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.late_ack)
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
2010-09-03 03:07:49 +08:00
|
|
|
return handled;
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
intel_bts_constraints(struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
unsigned int hw_event, bts_event;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2011-04-26 19:24:33 +08:00
|
|
|
if (event->attr.freq)
|
|
|
|
return NULL;
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
|
|
|
|
bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
|
2010-02-26 19:05:05 +08:00
|
|
|
return &bts_constraint;
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2015-06-25 02:23:35 +08:00
|
|
|
static int intel_alt_er(int idx, u64 config)
|
2011-05-23 17:08:15 +08:00
|
|
|
{
|
2016-01-28 06:24:29 +08:00
|
|
|
int alt_idx = idx;
|
|
|
|
|
2014-11-18 03:06:53 +08:00
|
|
|
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
|
2012-06-05 21:30:31 +08:00
|
|
|
return idx;
|
2011-05-23 17:08:15 +08:00
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
if (idx == EXTRA_REG_RSP_0)
|
2015-06-25 02:23:35 +08:00
|
|
|
alt_idx = EXTRA_REG_RSP_1;
|
2012-06-05 21:30:31 +08:00
|
|
|
|
|
|
|
if (idx == EXTRA_REG_RSP_1)
|
2015-06-25 02:23:35 +08:00
|
|
|
alt_idx = EXTRA_REG_RSP_0;
|
2012-06-05 21:30:31 +08:00
|
|
|
|
2015-06-25 02:23:35 +08:00
|
|
|
if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
|
|
|
|
return idx;
|
|
|
|
|
|
|
|
return alt_idx;
|
2012-06-05 21:30:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_fixup_er(struct perf_event *event, int idx)
|
|
|
|
{
|
|
|
|
event->hw.extra_reg.idx = idx;
|
|
|
|
|
|
|
|
if (idx == EXTRA_REG_RSP_0) {
|
2011-05-23 17:08:15 +08:00
|
|
|
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
|
2013-07-18 17:02:23 +08:00
|
|
|
event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
|
2011-05-23 17:08:15 +08:00
|
|
|
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
|
2012-06-05 21:30:31 +08:00
|
|
|
} else if (idx == EXTRA_REG_RSP_1) {
|
|
|
|
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
|
2013-07-18 17:02:23 +08:00
|
|
|
event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
|
2012-06-05 21:30:31 +08:00
|
|
|
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
|
2011-05-23 17:08:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
/*
|
|
|
|
* manage allocation of shared extra msr for certain events
|
|
|
|
*
|
|
|
|
* sharing can be:
|
|
|
|
* per-cpu: to be shared between the various events on a single PMU
|
|
|
|
* per-core: per-cpu + shared by HT threads
|
|
|
|
*/
|
2011-03-03 10:34:47 +08:00
|
|
|
static struct event_constraint *
|
2011-06-06 22:57:03 +08:00
|
|
|
__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
|
2012-02-10 06:20:53 +08:00
|
|
|
struct perf_event *event,
|
|
|
|
struct hw_perf_event_extra *reg)
|
2011-03-03 10:34:47 +08:00
|
|
|
{
|
2011-06-06 22:57:03 +08:00
|
|
|
struct event_constraint *c = &emptyconstraint;
|
2011-03-03 10:34:47 +08:00
|
|
|
struct er_account *era;
|
2011-06-06 22:57:08 +08:00
|
|
|
unsigned long flags;
|
2012-06-05 21:30:31 +08:00
|
|
|
int idx = reg->idx;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
/*
|
|
|
|
* reg->alloc can be set due to existing state, so for fake cpuc we
|
|
|
|
* need to ignore this, otherwise we might fail to allocate proper fake
|
|
|
|
* state for this extra reg constraint. Also see the comment below.
|
|
|
|
*/
|
|
|
|
if (reg->alloc && !cpuc->is_fake)
|
2012-02-10 06:20:53 +08:00
|
|
|
return NULL; /* call x86_get_event_constraint() */
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-05-23 17:08:15 +08:00
|
|
|
again:
|
2012-06-05 21:30:31 +08:00
|
|
|
era = &cpuc->shared_regs->regs[idx];
|
2011-06-06 22:57:08 +08:00
|
|
|
/*
|
|
|
|
* we use spin_lock_irqsave() to avoid lockdep issues when
|
|
|
|
* passing a fake cpuc
|
|
|
|
*/
|
|
|
|
raw_spin_lock_irqsave(&era->lock, flags);
|
2011-06-06 22:57:03 +08:00
|
|
|
|
|
|
|
if (!atomic_read(&era->ref) || era->config == reg->config) {
|
|
|
|
|
2012-06-05 21:30:31 +08:00
|
|
|
/*
|
|
|
|
* If its a fake cpuc -- as per validate_{group,event}() we
|
|
|
|
* shouldn't touch event state and we can avoid doing so
|
|
|
|
* since both will only call get_event_constraints() once
|
|
|
|
* on each event, this avoids the need for reg->alloc.
|
|
|
|
*
|
|
|
|
* Not doing the ER fixup will only result in era->reg being
|
|
|
|
* wrong, but since we won't actually try and program hardware
|
|
|
|
* this isn't a problem either.
|
|
|
|
*/
|
|
|
|
if (!cpuc->is_fake) {
|
|
|
|
if (idx != reg->idx)
|
|
|
|
intel_fixup_er(event, idx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* x86_schedule_events() can call get_event_constraints()
|
|
|
|
* multiple times on events in the case of incremental
|
|
|
|
* scheduling(). reg->alloc ensures we only do the ER
|
|
|
|
* allocation once.
|
|
|
|
*/
|
|
|
|
reg->alloc = 1;
|
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
/* lock in msr value */
|
|
|
|
era->config = reg->config;
|
|
|
|
era->reg = reg->reg;
|
|
|
|
|
|
|
|
/* one more user */
|
|
|
|
atomic_inc(&era->ref);
|
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
/*
|
2012-02-10 06:20:53 +08:00
|
|
|
* need to call x86_get_event_constraint()
|
|
|
|
* to check if associated event has constraints
|
2011-03-03 10:34:47 +08:00
|
|
|
*/
|
2012-02-10 06:20:53 +08:00
|
|
|
c = NULL;
|
2012-06-05 21:30:31 +08:00
|
|
|
} else {
|
2015-06-25 02:23:35 +08:00
|
|
|
idx = intel_alt_er(idx, reg->config);
|
2012-06-05 21:30:31 +08:00
|
|
|
if (idx != reg->idx) {
|
|
|
|
raw_spin_unlock_irqrestore(&era->lock, flags);
|
|
|
|
goto again;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
2011-06-06 22:57:08 +08:00
|
|
|
raw_spin_unlock_irqrestore(&era->lock, flags);
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct hw_perf_event_extra *reg)
|
|
|
|
{
|
|
|
|
struct er_account *era;
|
|
|
|
|
|
|
|
/*
|
2012-06-05 21:30:31 +08:00
|
|
|
* Only put constraint if extra reg was actually allocated. Also takes
|
|
|
|
* care of event which do not use an extra shared reg.
|
|
|
|
*
|
|
|
|
* Also, if this is a fake cpuc we shouldn't touch any event state
|
|
|
|
* (reg->alloc) and we don't care about leaving inconsistent cpuc state
|
|
|
|
* either since it'll be thrown out.
|
2011-06-06 22:57:03 +08:00
|
|
|
*/
|
2012-06-05 21:30:31 +08:00
|
|
|
if (!reg->alloc || cpuc->is_fake)
|
2011-06-06 22:57:03 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
era = &cpuc->shared_regs->regs[reg->idx];
|
|
|
|
|
|
|
|
/* one fewer user */
|
|
|
|
atomic_dec(&era->ref);
|
|
|
|
|
|
|
|
/* allocate again next time */
|
|
|
|
reg->alloc = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2012-02-10 06:20:53 +08:00
|
|
|
struct event_constraint *c = NULL, *d;
|
|
|
|
struct hw_perf_event_extra *xreg, *breg;
|
|
|
|
|
|
|
|
xreg = &event->hw.extra_reg;
|
|
|
|
if (xreg->idx != EXTRA_REG_NONE) {
|
|
|
|
c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
|
|
|
|
if (c == &emptyconstraint)
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
breg = &event->hw.branch_reg;
|
|
|
|
if (breg->idx != EXTRA_REG_NONE) {
|
|
|
|
d = __intel_shared_reg_get_constraints(cpuc, event, breg);
|
|
|
|
if (d == &emptyconstraint) {
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, xreg);
|
|
|
|
c = d;
|
|
|
|
}
|
|
|
|
}
|
2011-06-06 22:57:03 +08:00
|
|
|
return c;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
struct event_constraint *
|
2014-11-18 03:06:56 +08:00
|
|
|
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
2011-08-31 07:41:05 +08:00
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
if (x86_pmu.event_constraints) {
|
|
|
|
for_each_event_constraint(c, x86_pmu.event_constraints) {
|
2013-01-24 23:10:27 +08:00
|
|
|
if ((event->hw.config & c->cmask) == c->code) {
|
|
|
|
event->hw.flags |= c->flags;
|
2011-08-31 07:41:05 +08:00
|
|
|
return c;
|
2013-01-24 23:10:27 +08:00
|
|
|
}
|
2011-08-31 07:41:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return &unconstrained;
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
static struct event_constraint *
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
2014-11-18 03:06:56 +08:00
|
|
|
struct perf_event *event)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
c = intel_bts_constraints(event);
|
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
2015-03-27 22:38:25 +08:00
|
|
|
c = intel_shared_regs_constraints(cpuc, event);
|
2010-02-26 19:05:05 +08:00
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
2015-03-27 22:38:25 +08:00
|
|
|
c = intel_pebs_constraints(event);
|
2011-03-03 10:34:47 +08:00
|
|
|
if (c)
|
|
|
|
return c;
|
|
|
|
|
2014-11-18 03:06:56 +08:00
|
|
|
return x86_get_event_constraints(cpuc, idx, event);
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static void
|
|
|
|
intel_start_scheduling(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
2014-11-18 03:07:04 +08:00
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
xl->sched_started = true;
|
|
|
|
/*
|
|
|
|
* lock shared state until we are done scheduling
|
|
|
|
* in stop_event_scheduling()
|
|
|
|
* makes scheduling appear as a transaction
|
|
|
|
*/
|
|
|
|
raw_spin_lock(&excl_cntrs->lock);
|
|
|
|
}
|
|
|
|
|
2015-05-21 16:57:32 +08:00
|
|
|
static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
|
|
|
struct event_constraint *c = cpuc->event_constraint[idx];
|
|
|
|
struct intel_excl_states *xl;
|
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
|
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
lockdep_assert_held(&excl_cntrs->lock);
|
|
|
|
|
2015-05-21 16:57:36 +08:00
|
|
|
if (c->flags & PERF_X86_EVENT_EXCL)
|
2015-05-21 16:57:39 +08:00
|
|
|
xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
|
2015-05-21 16:57:36 +08:00
|
|
|
else
|
2015-05-21 16:57:39 +08:00
|
|
|
xl->state[cntr] = INTEL_EXCL_SHARED;
|
2015-05-21 16:57:32 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
static void
|
|
|
|
intel_stop_scheduling(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
xl->sched_started = false;
|
|
|
|
/*
|
|
|
|
* release shared state lock (acquired in intel_start_scheduling())
|
|
|
|
*/
|
|
|
|
raw_spin_unlock(&excl_cntrs->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
|
|
|
|
int idx, struct event_constraint *c)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xlo;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
int tid = cpuc->excl_thread_id;
|
2015-05-21 16:57:21 +08:00
|
|
|
int is_excl, i;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* validating a group does not require
|
|
|
|
* enforcing cross-thread exclusion
|
|
|
|
*/
|
2014-11-18 03:07:04 +08:00
|
|
|
if (cpuc->is_fake || !is_ht_workaround_enabled())
|
|
|
|
return c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* no exclusion needed
|
|
|
|
*/
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* because we modify the constraint, we need
|
|
|
|
* to make a copy. Static constraints come
|
|
|
|
* from static const tables.
|
|
|
|
*
|
|
|
|
* only needed when constraint has not yet
|
|
|
|
* been cloned (marked dynamic)
|
|
|
|
*/
|
|
|
|
if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
|
2015-05-21 16:57:24 +08:00
|
|
|
struct event_constraint *cx;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* grab pre-allocated constraint entry
|
|
|
|
*/
|
|
|
|
cx = &cpuc->constraint_list[idx];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialize dynamic constraint
|
|
|
|
* with static constraint
|
|
|
|
*/
|
2015-05-21 16:57:24 +08:00
|
|
|
*cx = *c;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* mark constraint as dynamic, so we
|
|
|
|
* can free it later on
|
|
|
|
*/
|
|
|
|
cx->flags |= PERF_X86_EVENT_DYNAMIC;
|
2015-05-21 16:57:24 +08:00
|
|
|
c = cx;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* From here on, the constraint is dynamic.
|
|
|
|
* Either it was just allocated above, or it
|
|
|
|
* was allocated during a earlier invocation
|
|
|
|
* of this function
|
|
|
|
*/
|
|
|
|
|
2015-05-21 16:57:21 +08:00
|
|
|
/*
|
|
|
|
* state of sibling HT
|
|
|
|
*/
|
|
|
|
xlo = &excl_cntrs->states[tid ^ 1];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* event requires exclusive counter access
|
|
|
|
* across HT threads
|
|
|
|
*/
|
|
|
|
is_excl = c->flags & PERF_X86_EVENT_EXCL;
|
|
|
|
if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
|
|
|
|
if (!cpuc->n_excl++)
|
|
|
|
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* Modify static constraint with current dynamic
|
|
|
|
* state of thread
|
|
|
|
*
|
|
|
|
* EXCLUSIVE: sibling counter measuring exclusive event
|
|
|
|
* SHARED : sibling counter measuring non-exclusive event
|
|
|
|
* UNUSED : sibling counter unused
|
|
|
|
*/
|
2015-05-21 16:57:24 +08:00
|
|
|
for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* exclusive event in sibling counter
|
|
|
|
* our corresponding counter cannot be used
|
|
|
|
* regardless of our event
|
|
|
|
*/
|
2015-05-21 16:57:21 +08:00
|
|
|
if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
|
2015-05-21 16:57:24 +08:00
|
|
|
__clear_bit(i, c->idxmsk);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* if measuring an exclusive event, sibling
|
|
|
|
* measuring non-exclusive, then counter cannot
|
|
|
|
* be used
|
|
|
|
*/
|
2015-05-21 16:57:21 +08:00
|
|
|
if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
|
2015-05-21 16:57:24 +08:00
|
|
|
__clear_bit(i, c->idxmsk);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* recompute actual bit weight for scheduling algorithm
|
|
|
|
*/
|
2015-05-21 16:57:24 +08:00
|
|
|
c->weight = hweight64(c->idxmsk64);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if we return an empty mask, then switch
|
|
|
|
* back to static empty constraint to avoid
|
|
|
|
* the cost of freeing later on
|
|
|
|
*/
|
2015-05-21 16:57:24 +08:00
|
|
|
if (c->weight == 0)
|
|
|
|
c = &emptyconstraint;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-21 16:57:24 +08:00
|
|
|
return c;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint *
|
|
|
|
intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2015-09-10 17:58:27 +08:00
|
|
|
struct event_constraint *c1 = NULL;
|
2014-11-18 03:07:01 +08:00
|
|
|
struct event_constraint *c2;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-09-10 17:58:27 +08:00
|
|
|
if (idx >= 0) /* fake does < 0 */
|
|
|
|
c1 = cpuc->event_constraint[idx];
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
/*
|
|
|
|
* first time only
|
|
|
|
* - static constraint: no change across incremental scheduling calls
|
|
|
|
* - dynamic constraint: handled by intel_get_excl_constraints()
|
|
|
|
*/
|
2014-11-18 03:07:01 +08:00
|
|
|
c2 = __intel_get_event_constraints(cpuc, idx, event);
|
|
|
|
if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
|
|
|
|
bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
|
|
|
|
c1->weight = c2->weight;
|
|
|
|
c2 = c1;
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
if (cpuc->excl_cntrs)
|
2014-11-18 03:07:01 +08:00
|
|
|
return intel_get_excl_constraints(cpuc, event, idx, c2);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2014-11-18 03:07:01 +08:00
|
|
|
return c2;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
|
|
|
|
int tid = cpuc->excl_thread_id;
|
2015-05-21 16:57:21 +08:00
|
|
|
struct intel_excl_states *xl;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* nothing needed if in group validation mode
|
|
|
|
*/
|
|
|
|
if (cpuc->is_fake)
|
|
|
|
return;
|
|
|
|
|
2015-05-21 16:57:28 +08:00
|
|
|
if (WARN_ON_ONCE(!excl_cntrs))
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
return;
|
|
|
|
|
2015-05-21 16:57:17 +08:00
|
|
|
if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
|
|
|
|
hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
|
|
|
|
if (!--cpuc->n_excl)
|
|
|
|
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
2015-05-22 17:36:13 +08:00
|
|
|
* If event was actually assigned, then mark the counter state as
|
|
|
|
* unused now.
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
*/
|
2015-05-22 17:36:13 +08:00
|
|
|
if (hwc->idx >= 0) {
|
|
|
|
xl = &excl_cntrs->states[tid];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* put_constraint may be called from x86_schedule_events()
|
|
|
|
* which already has the lock held so here make locking
|
|
|
|
* conditional.
|
|
|
|
*/
|
|
|
|
if (!xl->sched_started)
|
|
|
|
raw_spin_lock(&excl_cntrs->lock);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-21 16:57:21 +08:00
|
|
|
xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
2015-05-22 17:36:13 +08:00
|
|
|
if (!xl->sched_started)
|
|
|
|
raw_spin_unlock(&excl_cntrs->lock);
|
|
|
|
}
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
static void
|
|
|
|
intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
|
2011-03-03 10:34:47 +08:00
|
|
|
struct perf_event *event)
|
|
|
|
{
|
2011-06-06 22:57:03 +08:00
|
|
|
struct hw_perf_event_extra *reg;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
reg = &event->hw.extra_reg;
|
|
|
|
if (reg->idx != EXTRA_REG_NONE)
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, reg);
|
2012-02-10 06:20:53 +08:00
|
|
|
|
|
|
|
reg = &event->hw.branch_reg;
|
|
|
|
if (reg->idx != EXTRA_REG_NONE)
|
|
|
|
__intel_shared_reg_put_constraints(cpuc, reg);
|
2011-06-06 22:57:03 +08:00
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2011-06-06 22:57:03 +08:00
|
|
|
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
intel_put_shared_regs_event_constraints(cpuc, event);
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* is PMU has exclusive counter restrictions, then
|
|
|
|
* all events are subject to and must call the
|
|
|
|
* put_excl_constraints() routine
|
|
|
|
*/
|
2015-05-21 16:57:13 +08:00
|
|
|
if (cpuc->excl_cntrs)
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
intel_put_excl_constraints(cpuc, event);
|
|
|
|
}
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
static void intel_pebs_aliases_core2(struct perf_event *event)
|
2010-03-30 23:00:06 +08:00
|
|
|
{
|
2012-06-05 16:26:43 +08:00
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
2010-12-15 04:26:40 +08:00
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use INST_RETIRED.ANY_P
|
|
|
|
* (0x00c0), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* INST_RETIRED.ANY_P counts the number of cycles that retires
|
|
|
|
* CNTMASK instructions. By setting CNTMASK to a value (16)
|
|
|
|
* larger than the maximum number of instructions that can be
|
|
|
|
* retired per cycle (4) and then inverting the condition, we
|
|
|
|
* count all cycles that retire 16 or less instructions, which
|
|
|
|
* is every cycle.
|
|
|
|
*
|
|
|
|
* Thereby we gain a PEBS capable cycle counter.
|
|
|
|
*/
|
2012-03-12 19:44:35 +08:00
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_snb(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use UOPS_RETIRED.ALL
|
|
|
|
* (0x01c2), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* UOPS_RETIRED.ALL counts the number of cycles that retires
|
|
|
|
* CNTMASK micro-ops. By setting CNTMASK to a value (16)
|
|
|
|
* larger than the maximum number of micro-ops that can be
|
|
|
|
* retired per cycle (4) and then inverting the condition, we
|
|
|
|
* count all cycles that retire 16 or less micro-ops, which
|
|
|
|
* is every cycle.
|
|
|
|
*
|
|
|
|
* Thereby we gain a PEBS capable cycle counter.
|
|
|
|
*/
|
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
|
2010-12-15 04:26:40 +08:00
|
|
|
|
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
2012-06-05 16:26:43 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
static void intel_pebs_aliases_precdist(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
|
|
|
|
/*
|
|
|
|
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
|
|
|
|
* (0x003c) so that we can use it with PEBS.
|
|
|
|
*
|
|
|
|
* The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
|
|
|
|
* PEBS capable. However we can use INST_RETIRED.PREC_DIST
|
|
|
|
* (0x01c0), which is a PEBS capable event, to get the same
|
|
|
|
* count.
|
|
|
|
*
|
|
|
|
* The PREC_DIST event has special support to minimize sample
|
|
|
|
* shadowing effects. One drawback is that it can be
|
|
|
|
* only programmed on counter 1, but that seems like an
|
|
|
|
* acceptable trade off.
|
|
|
|
*/
|
|
|
|
u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
|
|
|
|
|
|
|
|
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
|
|
|
|
event->hw.config = alt_config;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_ivb(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip < 3)
|
|
|
|
return intel_pebs_aliases_snb(event);
|
|
|
|
return intel_pebs_aliases_precdist(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_pebs_aliases_skl(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip < 3)
|
|
|
|
return intel_pebs_aliases_core2(event);
|
|
|
|
return intel_pebs_aliases_precdist(event);
|
|
|
|
}
|
|
|
|
|
2015-05-28 12:13:14 +08:00
|
|
|
static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
|
|
|
|
{
|
|
|
|
unsigned long flags = x86_pmu.free_running_flags;
|
|
|
|
|
|
|
|
if (event->attr.use_clockid)
|
|
|
|
flags &= ~PERF_SAMPLE_TIME;
|
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
2012-06-05 16:26:43 +08:00
|
|
|
static int intel_pmu_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = x86_pmu_hw_config(event);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2015-05-07 03:33:47 +08:00
|
|
|
if (event->attr.precise_ip) {
|
perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)
PEBS always had the capability to log samples to its buffers without
an interrupt. Traditionally perf has not used this but always set the
PEBS threshold to one.
For frequently occurring events (like cycles or branches or load/store)
this in term requires using a relatively high sampling period to avoid
overloading the system, by only processing PMIs. This in term increases
sampling error.
For the common cases we still need to use the PMI because the PEBS
hardware has various limitations. The biggest one is that it can not
supply a callgraph. It also requires setting a fixed period, as the
hardware does not support adaptive period. Another issue is that it
cannot supply a time stamp and some other options. To supply a TID it
requires flushing on context switch. It can however supply the IP, the
load/store address, TSX information, registers, and some other things.
So we can make PEBS work for some specific cases, basically as long as
you can do without a callgraph and can set the period you can use this
new PEBS mode.
The main benefit is the ability to support much lower sampling period
(down to -c 1000) without extensive overhead.
One use cases is for example to increase the resolution of the c2c tool.
Another is double checking when you suspect the standard sampling has
too much sampling error.
Some numbers on the overhead, using cycle soak, comparing the elapsed
time from "kernbench -M -H" between plain (threshold set to one) and
multi (large threshold).
The test command for plain:
"perf record --time -e cycles:p -c $period -- kernbench -M -H"
The test command for multi:
"perf record --no-time -e cycles:p -c $period -- kernbench -M -H"
( The only difference of test command between multi and plain is time
stamp options. Since time stamp is not supported by large PEBS
threshold, it can be used as a flag to indicate if large threshold is
enabled during the test. )
period plain(Sec) multi(Sec) Delta
10003 32.7 16.5 16.2
20003 30.2 16.2 14.0
40003 18.6 14.1 4.5
80003 16.8 14.6 2.2
100003 16.9 14.1 2.8
800003 15.4 15.7 -0.3
1000003 15.3 15.2 0.2
2000003 15.3 15.1 0.1
With periods below 100003, plain (threshold one) cause much more
overhead. With 10003 sampling period, the Elapsed Time for multi is
even 2X faster than plain.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1430940834-8964-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-07 03:33:50 +08:00
|
|
|
if (!event->attr.freq) {
|
2015-05-07 03:33:47 +08:00
|
|
|
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
|
2015-05-28 12:13:14 +08:00
|
|
|
if (!(event->attr.sample_type &
|
|
|
|
~intel_pmu_free_running_flags(event)))
|
perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)
PEBS always had the capability to log samples to its buffers without
an interrupt. Traditionally perf has not used this but always set the
PEBS threshold to one.
For frequently occurring events (like cycles or branches or load/store)
this in term requires using a relatively high sampling period to avoid
overloading the system, by only processing PMIs. This in term increases
sampling error.
For the common cases we still need to use the PMI because the PEBS
hardware has various limitations. The biggest one is that it can not
supply a callgraph. It also requires setting a fixed period, as the
hardware does not support adaptive period. Another issue is that it
cannot supply a time stamp and some other options. To supply a TID it
requires flushing on context switch. It can however supply the IP, the
load/store address, TSX information, registers, and some other things.
So we can make PEBS work for some specific cases, basically as long as
you can do without a callgraph and can set the period you can use this
new PEBS mode.
The main benefit is the ability to support much lower sampling period
(down to -c 1000) without extensive overhead.
One use cases is for example to increase the resolution of the c2c tool.
Another is double checking when you suspect the standard sampling has
too much sampling error.
Some numbers on the overhead, using cycle soak, comparing the elapsed
time from "kernbench -M -H" between plain (threshold set to one) and
multi (large threshold).
The test command for plain:
"perf record --time -e cycles:p -c $period -- kernbench -M -H"
The test command for multi:
"perf record --no-time -e cycles:p -c $period -- kernbench -M -H"
( The only difference of test command between multi and plain is time
stamp options. Since time stamp is not supported by large PEBS
threshold, it can be used as a flag to indicate if large threshold is
enabled during the test. )
period plain(Sec) multi(Sec) Delta
10003 32.7 16.5 16.2
20003 30.2 16.2 14.0
40003 18.6 14.1 4.5
80003 16.8 14.6 2.2
100003 16.9 14.1 2.8
800003 15.4 15.7 -0.3
1000003 15.3 15.2 0.2
2000003 15.3 15.1 0.1
With periods below 100003, plain (threshold one) cause much more
overhead. With 10003 sampling period, the Elapsed Time for multi is
even 2X faster than plain.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1430940834-8964-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-07 03:33:50 +08:00
|
|
|
event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
|
|
|
|
}
|
2015-05-07 03:33:47 +08:00
|
|
|
if (x86_pmu.pebs_aliases)
|
|
|
|
x86_pmu.pebs_aliases(event);
|
|
|
|
}
|
2010-12-15 04:26:40 +08:00
|
|
|
|
2014-11-05 10:56:06 +08:00
|
|
|
if (needs_branch_stack(event)) {
|
2012-02-10 06:20:57 +08:00
|
|
|
ret = intel_pmu_setup_lbr_filter(event);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2015-01-14 20:18:20 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* BTS is set up earlier in this path, so don't account twice
|
|
|
|
*/
|
|
|
|
if (!intel_pmu_has_bts(event)) {
|
|
|
|
/* disallow lbr if conflicting events are present */
|
|
|
|
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
event->destroy = hw_perf_lbr_event_destroy;
|
|
|
|
}
|
2012-02-10 06:20:57 +08:00
|
|
|
}
|
|
|
|
|
2010-03-30 23:00:06 +08:00
|
|
|
if (event->attr.type != PERF_TYPE_RAW)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (x86_pmu.version < 3)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
|
|
|
|
return -EACCES;
|
|
|
|
|
|
|
|
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
|
|
|
|
{
|
|
|
|
if (x86_pmu.guest_get_msrs)
|
|
|
|
return x86_pmu.guest_get_msrs(nr);
|
|
|
|
*nr = 0;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
|
|
|
|
|
|
|
|
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
|
|
|
|
|
|
|
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
|
|
|
|
arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
|
|
|
|
arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
|
2012-08-09 16:52:34 +08:00
|
|
|
/*
|
|
|
|
* If PMU counter has PEBS enabled it is not enough to disable counter
|
|
|
|
* on a guest entry since PEBS memory write can overshoot guest entry
|
|
|
|
* and corrupt guest memory. Disabling PEBS solves the problem.
|
|
|
|
*/
|
|
|
|
arr[1].msr = MSR_IA32_PEBS_ENABLE;
|
|
|
|
arr[1].host = cpuc->pebs_enabled;
|
|
|
|
arr[1].guest = 0;
|
2011-10-05 20:01:21 +08:00
|
|
|
|
2012-08-09 16:52:34 +08:00
|
|
|
*nr = 2;
|
2011-10-05 20:01:21 +08:00
|
|
|
return arr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
|
|
|
struct perf_event *event = cpuc->events[idx];
|
|
|
|
|
|
|
|
arr[idx].msr = x86_pmu_config_addr(idx);
|
|
|
|
arr[idx].host = arr[idx].guest = 0;
|
|
|
|
|
|
|
|
if (!test_bit(idx, cpuc->active_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
arr[idx].host = arr[idx].guest =
|
|
|
|
event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
|
|
|
|
if (event->attr.exclude_host)
|
|
|
|
arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
else if (event->attr.exclude_guest)
|
|
|
|
arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
|
|
|
|
}
|
|
|
|
|
|
|
|
*nr = x86_pmu.num_counters;
|
|
|
|
return arr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void core_pmu_enable_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (!event->attr.exclude_host)
|
|
|
|
x86_pmu_enable_event(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void core_pmu_enable_all(int added)
|
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2011-10-05 20:01:21 +08:00
|
|
|
int idx;
|
|
|
|
|
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
|
|
|
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
|
|
|
|
|
|
|
|
if (!test_bit(idx, cpuc->active_mask) ||
|
|
|
|
cpuc->events[idx]->attr.exclude_host)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-18 08:36:48 +08:00
|
|
|
static int hsw_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = intel_pmu_hw_config(event);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
|
|
|
|
return 0;
|
|
|
|
event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
|
|
|
|
* PEBS or in ANY thread mode. Since the results are non-sensical forbid
|
|
|
|
* this combination.
|
|
|
|
*/
|
|
|
|
if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
|
|
|
|
((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
|
|
|
|
event->attr.precise_ip > 0))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2013-09-06 11:37:38 +08:00
|
|
|
if (event_is_checkpointed(event)) {
|
|
|
|
/*
|
|
|
|
* Sampling of checkpointed events can cause situations where
|
|
|
|
* the CPU constantly aborts because of a overflow, which is
|
|
|
|
* then checkpointed back and ignored. Forbid checkpointing
|
|
|
|
* for sampling.
|
|
|
|
*
|
|
|
|
* But still allow a long sampling period, so that perf stat
|
|
|
|
* from KVM works.
|
|
|
|
*/
|
|
|
|
if (event->attr.sample_period > 0 &&
|
|
|
|
event->attr.sample_period < 0x7fffffff)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
2013-06-18 08:36:48 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct event_constraint counter2_constraint =
|
|
|
|
EVENT_CONSTRAINT(0, 0x4, 0);
|
|
|
|
|
|
|
|
static struct event_constraint *
|
2014-11-18 03:06:56 +08:00
|
|
|
hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
|
|
|
struct perf_event *event)
|
2013-06-18 08:36:48 +08:00
|
|
|
{
|
2014-11-18 03:06:56 +08:00
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
c = intel_get_event_constraints(cpuc, idx, event);
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
|
|
|
|
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
|
|
|
|
if (c->idxmsk64 & (1U << 2))
|
|
|
|
return &counter2_constraint;
|
|
|
|
return &emptyconstraint;
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
/*
|
|
|
|
* Broadwell:
|
|
|
|
*
|
|
|
|
* The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
|
|
|
|
* (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
|
|
|
|
* the two to enforce a minimum period of 128 (the smallest value that has bits
|
|
|
|
* 0-5 cleared and >= 100).
|
|
|
|
*
|
|
|
|
* Because of how the code in x86_perf_event_set_period() works, the truncation
|
|
|
|
* of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
|
|
|
|
* to make up for the 'lost' events due to carrying the 'error' in period_left.
|
|
|
|
*
|
|
|
|
* Therefore the effective (average) period matches the requested period,
|
|
|
|
* despite coarser hardware granularity.
|
|
|
|
*/
|
|
|
|
static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
|
|
|
|
{
|
|
|
|
if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
|
|
|
|
X86_CONFIG(.event=0xc0, .umask=0x01)) {
|
|
|
|
if (left < 128)
|
|
|
|
left = 128;
|
|
|
|
left &= ~0x3fu;
|
|
|
|
}
|
|
|
|
return left;
|
|
|
|
}
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
PMU_FORMAT_ATTR(event, "config:0-7" );
|
|
|
|
PMU_FORMAT_ATTR(umask, "config:8-15" );
|
|
|
|
PMU_FORMAT_ATTR(edge, "config:18" );
|
|
|
|
PMU_FORMAT_ATTR(pc, "config:19" );
|
|
|
|
PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
|
|
|
|
PMU_FORMAT_ATTR(inv, "config:23" );
|
|
|
|
PMU_FORMAT_ATTR(cmask, "config:24-31" );
|
2013-06-18 08:36:48 +08:00
|
|
|
PMU_FORMAT_ATTR(in_tx, "config:32");
|
|
|
|
PMU_FORMAT_ATTR(in_tx_cp, "config:33");
|
2012-03-16 03:09:14 +08:00
|
|
|
|
|
|
|
static struct attribute *intel_arch_formats_attr[] = {
|
|
|
|
&format_attr_event.attr,
|
|
|
|
&format_attr_umask.attr,
|
|
|
|
&format_attr_edge.attr,
|
|
|
|
&format_attr_pc.attr,
|
|
|
|
&format_attr_inv.attr,
|
|
|
|
&format_attr_cmask.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2012-10-10 20:53:14 +08:00
|
|
|
ssize_t intel_event_sysfs_show(char *page, u64 config)
|
|
|
|
{
|
|
|
|
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
|
|
|
|
|
|
|
|
return x86_event_sysfs_show(page, config, event);
|
|
|
|
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
struct intel_shared_regs *allocate_shared_regs(int cpu)
|
2011-06-06 22:57:03 +08:00
|
|
|
{
|
|
|
|
struct intel_shared_regs *regs;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
regs = kzalloc_node(sizeof(struct intel_shared_regs),
|
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (regs) {
|
|
|
|
/*
|
|
|
|
* initialize the locks to keep lockdep happy
|
|
|
|
*/
|
|
|
|
for (i = 0; i < EXTRA_REG_MAX; i++)
|
|
|
|
raw_spin_lock_init(®s->regs[i].lock);
|
|
|
|
|
|
|
|
regs->core_id = -1;
|
|
|
|
}
|
|
|
|
return regs;
|
|
|
|
}
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
|
|
|
|
{
|
|
|
|
struct intel_excl_cntrs *c;
|
|
|
|
|
|
|
|
c = kzalloc_node(sizeof(struct intel_excl_cntrs),
|
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (c) {
|
|
|
|
raw_spin_lock_init(&c->lock);
|
|
|
|
c->core_id = -1;
|
|
|
|
}
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
static int intel_pmu_cpu_prepare(int cpu)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
|
|
|
|
cpuc->shared_regs = allocate_shared_regs(cpu);
|
|
|
|
if (!cpuc->shared_regs)
|
2015-08-10 20:17:34 +08:00
|
|
|
goto err;
|
2014-11-18 03:06:57 +08:00
|
|
|
}
|
2011-03-03 10:34:50 +08:00
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
|
|
|
|
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
|
|
|
|
|
|
|
|
cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
|
|
|
|
if (!cpuc->constraint_list)
|
2015-08-10 20:17:34 +08:00
|
|
|
goto err_shared_regs;
|
2014-11-18 03:06:57 +08:00
|
|
|
|
|
|
|
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
|
2015-08-10 20:17:34 +08:00
|
|
|
if (!cpuc->excl_cntrs)
|
|
|
|
goto err_constraint_list;
|
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
cpuc->excl_thread_id = 0;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
|
|
|
return NOTIFY_OK;
|
2015-08-10 20:17:34 +08:00
|
|
|
|
|
|
|
err_constraint_list:
|
|
|
|
kfree(cpuc->constraint_list);
|
|
|
|
cpuc->constraint_list = NULL;
|
|
|
|
|
|
|
|
err_shared_regs:
|
|
|
|
kfree(cpuc->shared_regs);
|
|
|
|
cpuc->shared_regs = NULL;
|
|
|
|
|
|
|
|
err:
|
|
|
|
return NOTIFY_BAD;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2010-03-05 20:49:35 +08:00
|
|
|
static void intel_pmu_cpu_starting(int cpu)
|
|
|
|
{
|
2011-03-03 10:34:47 +08:00
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
int core_id = topology_core_id(cpu);
|
|
|
|
int i;
|
|
|
|
|
2011-03-03 10:34:50 +08:00
|
|
|
init_debug_store_on_cpu(cpu);
|
|
|
|
/*
|
|
|
|
* Deal with CPUs that don't clear their LBRs on power-up.
|
|
|
|
*/
|
|
|
|
intel_pmu_lbr_reset();
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->lbr_sel = NULL;
|
|
|
|
|
|
|
|
if (!cpuc->shared_regs)
|
2011-03-03 10:34:50 +08:00
|
|
|
return;
|
|
|
|
|
2014-11-18 03:06:53 +08:00
|
|
|
if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
|
2015-05-26 21:11:28 +08:00
|
|
|
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
|
2012-02-10 06:20:53 +08:00
|
|
|
struct intel_shared_regs *pc;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
pc = per_cpu(cpu_hw_events, i).shared_regs;
|
|
|
|
if (pc && pc->core_id == core_id) {
|
2016-01-28 06:31:09 +08:00
|
|
|
cpuc->kfree_on_online[0] = cpuc->shared_regs;
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->shared_regs = pc;
|
|
|
|
break;
|
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
2012-02-10 06:20:53 +08:00
|
|
|
cpuc->shared_regs->core_id = core_id;
|
|
|
|
cpuc->shared_regs->refcnt++;
|
2011-03-03 10:34:47 +08:00
|
|
|
}
|
|
|
|
|
2012-02-10 06:20:53 +08:00
|
|
|
if (x86_pmu.lbr_sel_map)
|
|
|
|
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
|
2014-11-18 03:06:57 +08:00
|
|
|
|
|
|
|
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
|
2015-05-26 21:11:28 +08:00
|
|
|
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
|
2014-11-18 03:06:57 +08:00
|
|
|
struct intel_excl_cntrs *c;
|
|
|
|
|
|
|
|
c = per_cpu(cpu_hw_events, i).excl_cntrs;
|
|
|
|
if (c && c->core_id == core_id) {
|
|
|
|
cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
|
|
|
|
cpuc->excl_cntrs = c;
|
|
|
|
cpuc->excl_thread_id = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cpuc->excl_cntrs->core_id = core_id;
|
|
|
|
cpuc->excl_cntrs->refcnt++;
|
|
|
|
}
|
2010-03-05 20:49:35 +08:00
|
|
|
}
|
|
|
|
|
2014-11-18 03:07:04 +08:00
|
|
|
static void free_excl_cntrs(int cpu)
|
2010-03-05 20:49:35 +08:00
|
|
|
{
|
2011-03-03 10:34:47 +08:00
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
2014-11-18 03:06:57 +08:00
|
|
|
struct intel_excl_cntrs *c;
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2014-11-18 03:06:57 +08:00
|
|
|
c = cpuc->excl_cntrs;
|
|
|
|
if (c) {
|
|
|
|
if (c->core_id == -1 || --c->refcnt == 0)
|
|
|
|
kfree(c);
|
|
|
|
cpuc->excl_cntrs = NULL;
|
|
|
|
kfree(cpuc->constraint_list);
|
|
|
|
cpuc->constraint_list = NULL;
|
|
|
|
}
|
2014-11-18 03:07:04 +08:00
|
|
|
}
|
2011-03-03 10:34:47 +08:00
|
|
|
|
2014-11-18 03:07:04 +08:00
|
|
|
static void intel_pmu_cpu_dying(int cpu)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
struct intel_shared_regs *pc;
|
|
|
|
|
|
|
|
pc = cpuc->shared_regs;
|
|
|
|
if (pc) {
|
|
|
|
if (pc->core_id == -1 || --pc->refcnt == 0)
|
|
|
|
kfree(pc);
|
|
|
|
cpuc->shared_regs = NULL;
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled, yield the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.
Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
|
|
|
}
|
|
|
|
|
2014-11-18 03:07:04 +08:00
|
|
|
free_excl_cntrs(cpu);
|
|
|
|
|
2010-03-05 20:49:35 +08:00
|
|
|
fini_debug_store_on_cpu(cpu);
|
|
|
|
}
|
|
|
|
|
2015-05-07 03:33:51 +08:00
|
|
|
static void intel_pmu_sched_task(struct perf_event_context *ctx,
|
|
|
|
bool sched_in)
|
|
|
|
{
|
|
|
|
if (x86_pmu.pebs_active)
|
|
|
|
intel_pmu_pebs_sched_task(ctx, sched_in);
|
|
|
|
if (x86_pmu.lbr_nr)
|
|
|
|
intel_pmu_lbr_sched_task(ctx, sched_in);
|
|
|
|
}
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
|
|
|
|
|
2013-01-24 23:10:33 +08:00
|
|
|
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
|
|
|
|
|
2015-09-10 05:53:59 +08:00
|
|
|
PMU_FORMAT_ATTR(frontend, "config1:0-23");
|
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
static struct attribute *intel_arch3_formats_attr[] = {
|
|
|
|
&format_attr_event.attr,
|
|
|
|
&format_attr_umask.attr,
|
|
|
|
&format_attr_edge.attr,
|
|
|
|
&format_attr_pc.attr,
|
|
|
|
&format_attr_any.attr,
|
|
|
|
&format_attr_inv.attr,
|
|
|
|
&format_attr_cmask.attr,
|
2013-06-18 08:36:48 +08:00
|
|
|
&format_attr_in_tx.attr,
|
|
|
|
&format_attr_in_tx_cp.attr,
|
2012-03-16 03:09:14 +08:00
|
|
|
|
|
|
|
&format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
|
2013-01-24 23:10:33 +08:00
|
|
|
&format_attr_ldlat.attr, /* PEBS load latency */
|
2012-03-16 03:09:14 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2015-09-10 05:53:59 +08:00
|
|
|
static struct attribute *skl_format_attr[] = {
|
|
|
|
&format_attr_frontend.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2015-04-21 23:26:23 +08:00
|
|
|
static __initconst const struct x86_pmu core_pmu = {
|
|
|
|
.name = "core",
|
|
|
|
.handle_irq = x86_pmu_handle_irq,
|
|
|
|
.disable_all = x86_pmu_disable_all,
|
|
|
|
.enable_all = core_pmu_enable_all,
|
|
|
|
.enable = core_pmu_enable_event,
|
|
|
|
.disable = x86_pmu_disable_event,
|
|
|
|
.hw_config = x86_pmu_hw_config,
|
|
|
|
.schedule_events = x86_schedule_events,
|
|
|
|
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
|
|
|
|
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
|
|
|
|
.event_map = intel_pmu_event_map,
|
|
|
|
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
|
|
|
|
.apic = 1,
|
2015-05-28 12:13:14 +08:00
|
|
|
.free_running_flags = PEBS_FREERUNNING_FLAGS,
|
|
|
|
|
2015-04-21 23:26:23 +08:00
|
|
|
/*
|
|
|
|
* Intel PMCs cannot be accessed sanely above 32-bit width,
|
|
|
|
* so we install an artificial 1<<31 period regardless of
|
|
|
|
* the generic event period:
|
|
|
|
*/
|
|
|
|
.max_period = (1ULL<<31) - 1,
|
|
|
|
.get_event_constraints = intel_get_event_constraints,
|
|
|
|
.put_event_constraints = intel_put_event_constraints,
|
|
|
|
.event_constraints = intel_core_event_constraints,
|
|
|
|
.guest_get_msrs = core_guest_get_msrs,
|
|
|
|
.format_attrs = intel_arch_formats_attr,
|
|
|
|
.events_sysfs_show = intel_event_sysfs_show,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Virtual (or funny metal) CPU can define x86_pmu.extra_regs
|
|
|
|
* together with PMU version 1 and thus be using core_pmu with
|
|
|
|
* shared_regs. We need following callbacks here to allocate
|
|
|
|
* it properly.
|
|
|
|
*/
|
|
|
|
.cpu_prepare = intel_pmu_cpu_prepare,
|
|
|
|
.cpu_starting = intel_pmu_cpu_starting,
|
|
|
|
.cpu_dying = intel_pmu_cpu_dying,
|
|
|
|
};
|
|
|
|
|
2010-03-29 19:09:53 +08:00
|
|
|
static __initconst const struct x86_pmu intel_pmu = {
|
2010-02-26 19:05:05 +08:00
|
|
|
.name = "Intel",
|
|
|
|
.handle_irq = intel_pmu_handle_irq,
|
|
|
|
.disable_all = intel_pmu_disable_all,
|
|
|
|
.enable_all = intel_pmu_enable_all,
|
|
|
|
.enable = intel_pmu_enable_event,
|
|
|
|
.disable = intel_pmu_disable_event,
|
2010-03-30 23:00:06 +08:00
|
|
|
.hw_config = intel_pmu_hw_config,
|
2010-03-12 00:54:39 +08:00
|
|
|
.schedule_events = x86_schedule_events,
|
2010-02-26 19:05:05 +08:00
|
|
|
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
|
|
|
|
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
|
|
|
|
.event_map = intel_pmu_event_map,
|
|
|
|
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
|
|
|
|
.apic = 1,
|
2015-05-28 12:13:14 +08:00
|
|
|
.free_running_flags = PEBS_FREERUNNING_FLAGS,
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Intel PMCs cannot be accessed sanely above 32 bit width,
|
|
|
|
* so we install an artificial 1<<31 period regardless of
|
|
|
|
* the generic event period:
|
|
|
|
*/
|
|
|
|
.max_period = (1ULL << 31) - 1,
|
2010-03-05 20:01:18 +08:00
|
|
|
.get_event_constraints = intel_get_event_constraints,
|
2011-03-03 10:34:47 +08:00
|
|
|
.put_event_constraints = intel_put_event_constraints,
|
2012-06-05 16:26:43 +08:00
|
|
|
.pebs_aliases = intel_pebs_aliases_core2,
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
.format_attrs = intel_arch3_formats_attr,
|
2012-10-10 20:53:14 +08:00
|
|
|
.events_sysfs_show = intel_event_sysfs_show,
|
2012-03-16 03:09:14 +08:00
|
|
|
|
2011-03-03 10:34:47 +08:00
|
|
|
.cpu_prepare = intel_pmu_cpu_prepare,
|
2010-03-05 20:49:35 +08:00
|
|
|
.cpu_starting = intel_pmu_cpu_starting,
|
|
|
|
.cpu_dying = intel_pmu_cpu_dying,
|
2011-10-05 20:01:21 +08:00
|
|
|
.guest_get_msrs = intel_guest_get_msrs,
|
2015-05-07 03:33:51 +08:00
|
|
|
.sched_task = intel_pmu_sched_task,
|
2010-02-26 19:05:05 +08:00
|
|
|
};
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_clovertown_quirk(void)
|
2010-03-05 04:49:01 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* PEBS is unreliable due to:
|
|
|
|
*
|
|
|
|
* AJ67 - PEBS may experience CPL leaks
|
|
|
|
* AJ68 - PEBS PMI may be delayed by one event
|
|
|
|
* AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
|
|
|
|
* AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
|
|
|
|
*
|
|
|
|
* AJ67 could be worked around by restricting the OS/USR flags.
|
|
|
|
* AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
|
|
|
|
*
|
|
|
|
* AJ106 could possibly be worked around by not allowing LBR
|
|
|
|
* usage from PEBS, including the fixup.
|
|
|
|
* AJ68 could possibly be worked around by always programming
|
2011-04-27 17:51:41 +08:00
|
|
|
* a pebs_event_reset[0] value and coping with the lost events.
|
2010-03-05 04:49:01 +08:00
|
|
|
*
|
|
|
|
* But taken together it might just make sense to not enable PEBS on
|
|
|
|
* these chips.
|
|
|
|
*/
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("PEBS disabled due to CPU errata\n");
|
2010-03-05 04:49:01 +08:00
|
|
|
x86_pmu.pebs = 0;
|
|
|
|
x86_pmu.pebs_constraints = NULL;
|
|
|
|
}
|
|
|
|
|
2012-06-08 20:50:50 +08:00
|
|
|
static int intel_snb_pebs_broken(int cpu)
|
|
|
|
{
|
|
|
|
u32 rev = UINT_MAX; /* default to broken for unknown models */
|
|
|
|
|
|
|
|
switch (cpu_data(cpu).x86_model) {
|
|
|
|
case 42: /* SNB */
|
|
|
|
rev = 0x28;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 45: /* SNB-EP */
|
|
|
|
switch (cpu_data(cpu).x86_mask) {
|
|
|
|
case 6: rev = 0x618; break;
|
|
|
|
case 7: rev = 0x70c; break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (cpu_data(cpu).microcode < rev);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_snb_check_microcode(void)
|
|
|
|
{
|
|
|
|
int pebs_broken = 0;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
get_online_cpus();
|
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
if ((pebs_broken = intel_snb_pebs_broken(cpu)))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
put_online_cpus();
|
|
|
|
|
|
|
|
if (pebs_broken == x86_pmu.pebs_broken)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Serialized by the microcode lock..
|
|
|
|
*/
|
|
|
|
if (x86_pmu.pebs_broken) {
|
|
|
|
pr_info("PEBS enabled due to microcode update\n");
|
|
|
|
x86_pmu.pebs_broken = 0;
|
|
|
|
} else {
|
|
|
|
pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
|
|
|
|
x86_pmu.pebs_broken = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Under certain circumstances, access certain MSR may cause #GP.
|
|
|
|
* The function tests if the input MSR can be safely accessed.
|
|
|
|
*/
|
|
|
|
static bool check_msr(unsigned long msr, u64 mask)
|
|
|
|
{
|
|
|
|
u64 val_old, val_new, val_tmp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read the current value, change it and read it back to see if it
|
|
|
|
* matches, this is needed to detect certain hardware emulators
|
|
|
|
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
|
|
|
|
*/
|
|
|
|
if (rdmsrl_safe(msr, &val_old))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only change the bits which can be updated by wrmsrl.
|
|
|
|
*/
|
|
|
|
val_tmp = val_old ^ mask;
|
|
|
|
if (wrmsrl_safe(msr, val_tmp) ||
|
|
|
|
rdmsrl_safe(msr, &val_new))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (val_new != val_tmp)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* Here it's sure that the MSR can be safely accessed.
|
|
|
|
* Restore the old value and return.
|
|
|
|
*/
|
|
|
|
wrmsrl(msr, val_old);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_sandybridge_quirk(void)
|
2011-11-15 17:51:15 +08:00
|
|
|
{
|
2012-06-08 20:50:50 +08:00
|
|
|
x86_pmu.check_microcode = intel_snb_check_microcode;
|
|
|
|
intel_snb_check_microcode();
|
2011-11-15 17:51:15 +08:00
|
|
|
}
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
|
|
|
|
{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
|
|
|
|
{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
|
|
|
|
{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
|
|
|
|
{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
|
|
|
|
{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
|
|
|
|
{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
|
|
|
|
{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
|
2011-11-10 20:57:26 +08:00
|
|
|
};
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
static __init void intel_arch_events_quirk(void)
|
|
|
|
{
|
|
|
|
int bit;
|
|
|
|
|
|
|
|
/* disable event that reported as not presend by cpuid */
|
|
|
|
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
|
|
|
|
intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_warn("CPUID marked event: \'%s\' unavailable\n",
|
|
|
|
intel_arch_events_map[bit].name);
|
2011-12-06 21:07:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init void intel_nehalem_quirk(void)
|
|
|
|
{
|
|
|
|
union cpuid10_ebx ebx;
|
|
|
|
|
|
|
|
ebx.full = x86_pmu.events_maskl;
|
|
|
|
if (ebx.split.no_branch_misses_retired) {
|
|
|
|
/*
|
|
|
|
* Erratum AAJ80 detected, we work it around by using
|
|
|
|
* the BR_MISP_EXEC.ANY event. This will over-count
|
|
|
|
* branch-misses, but it's still much better than the
|
|
|
|
* architectural event which is often completely bogus:
|
|
|
|
*/
|
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
|
|
|
|
ebx.split.no_branch_misses_retired = 0;
|
|
|
|
x86_pmu.events_maskl = ebx.full;
|
2012-05-22 10:50:07 +08:00
|
|
|
pr_info("CPU erratum AAJ80 worked around\n");
|
2011-12-06 21:07:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-18 03:06:59 +08:00
|
|
|
/*
|
|
|
|
* enable software workaround for errata:
|
|
|
|
* SNB: BJ122
|
|
|
|
* IVB: BV98
|
|
|
|
* HSW: HSD29
|
|
|
|
*
|
|
|
|
* Only needed when HT is enabled. However detecting
|
2014-11-18 03:07:04 +08:00
|
|
|
* if HT is enabled is difficult (model specific). So instead,
|
|
|
|
* we enable the workaround in the early boot, and verify if
|
|
|
|
* it is needed in a later initcall phase once we have valid
|
|
|
|
* topology information to check if HT is actually enabled
|
2014-11-18 03:06:59 +08:00
|
|
|
*/
|
|
|
|
static __init void intel_ht_bug(void)
|
|
|
|
{
|
2014-11-18 03:07:04 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
x86_pmu.start_scheduling = intel_start_scheduling;
|
2015-05-21 16:57:32 +08:00
|
|
|
x86_pmu.commit_scheduling = intel_commit_scheduling;
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_pmu.stop_scheduling = intel_stop_scheduling;
|
|
|
|
}
|
|
|
|
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
|
|
|
|
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
|
2013-06-18 08:36:52 +08:00
|
|
|
|
2013-09-06 11:37:40 +08:00
|
|
|
/* Haswell special events */
|
2013-09-13 01:17:00 +08:00
|
|
|
EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
|
|
|
|
EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
|
|
|
|
EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
|
|
|
|
EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
|
|
|
|
EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
|
2013-09-06 11:37:40 +08:00
|
|
|
|
2013-06-18 08:36:52 +08:00
|
|
|
static struct attribute *hsw_events_attrs[] = {
|
2013-09-06 11:37:40 +08:00
|
|
|
EVENT_PTR(tx_start),
|
|
|
|
EVENT_PTR(tx_commit),
|
|
|
|
EVENT_PTR(tx_abort),
|
|
|
|
EVENT_PTR(tx_capacity),
|
|
|
|
EVENT_PTR(tx_conflict),
|
|
|
|
EVENT_PTR(el_start),
|
|
|
|
EVENT_PTR(el_commit),
|
|
|
|
EVENT_PTR(el_abort),
|
|
|
|
EVENT_PTR(el_capacity),
|
|
|
|
EVENT_PTR(el_conflict),
|
|
|
|
EVENT_PTR(cycles_t),
|
|
|
|
EVENT_PTR(cycles_ct),
|
2013-06-18 08:36:52 +08:00
|
|
|
EVENT_PTR(mem_ld_hsw),
|
|
|
|
EVENT_PTR(mem_st_hsw),
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
__init int intel_pmu_init(void)
|
2010-02-26 19:05:05 +08:00
|
|
|
{
|
|
|
|
union cpuid10_edx edx;
|
|
|
|
union cpuid10_eax eax;
|
2011-11-10 20:57:26 +08:00
|
|
|
union cpuid10_ebx ebx;
|
2012-06-21 02:46:34 +08:00
|
|
|
struct event_constraint *c;
|
2010-02-26 19:05:05 +08:00
|
|
|
unsigned int unused;
|
2014-07-15 03:25:56 +08:00
|
|
|
struct extra_reg *er;
|
|
|
|
int version, i;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
|
|
|
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
|
2010-03-12 00:54:39 +08:00
|
|
|
switch (boot_cpu_data.x86) {
|
|
|
|
case 0x6:
|
|
|
|
return p6_pmu_init();
|
2012-09-27 02:12:52 +08:00
|
|
|
case 0xb:
|
|
|
|
return knc_pmu_init();
|
2010-03-12 00:54:39 +08:00
|
|
|
case 0xf:
|
|
|
|
return p4_pmu_init();
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether the Architectural PerfMon supports
|
|
|
|
* Branch Misses Retired hw_event or not.
|
|
|
|
*/
|
2011-11-10 20:57:26 +08:00
|
|
|
cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
|
|
|
|
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
|
2010-02-26 19:05:05 +08:00
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
version = eax.split.version_id;
|
|
|
|
if (version < 2)
|
|
|
|
x86_pmu = core_pmu;
|
|
|
|
else
|
|
|
|
x86_pmu = intel_pmu;
|
|
|
|
|
|
|
|
x86_pmu.version = version;
|
2010-03-30 00:36:50 +08:00
|
|
|
x86_pmu.num_counters = eax.split.num_counters;
|
|
|
|
x86_pmu.cntval_bits = eax.split.bit_width;
|
|
|
|
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_pmu.events_maskl = ebx.full;
|
|
|
|
x86_pmu.events_mask_len = eax.split.mask_length;
|
|
|
|
|
2012-06-06 08:56:48 +08:00
|
|
|
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Quirk: v2 perfmon does not report fixed-purpose events, so
|
|
|
|
* assume at least 3 events:
|
|
|
|
*/
|
|
|
|
if (version > 1)
|
2010-03-30 00:36:50 +08:00
|
|
|
x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2014-02-03 21:29:03 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_PDCM)) {
|
2010-03-04 00:07:40 +08:00
|
|
|
u64 capabilities;
|
|
|
|
|
|
|
|
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
|
|
|
|
x86_pmu.intel_cap.capabilities = capabilities;
|
|
|
|
}
|
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-03 02:52:12 +08:00
|
|
|
intel_ds_init();
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
/*
|
|
|
|
* Install the hw-cache-events table:
|
|
|
|
*/
|
|
|
|
switch (boot_cpu_data.x86_model) {
|
2014-07-30 18:08:56 +08:00
|
|
|
case 14: /* 65nm Core "Yonah" */
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Core events, ");
|
|
|
|
break;
|
|
|
|
|
2014-07-30 18:08:56 +08:00
|
|
|
case 15: /* 65nm Core2 "Merom" */
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_clovertown_quirk);
|
2014-07-30 18:08:56 +08:00
|
|
|
case 22: /* 65nm Core2 "Merom-L" */
|
|
|
|
case 23: /* 45nm Core2 "Penryn" */
|
|
|
|
case 29: /* 45nm Core2 "Dunnington (MP) */
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_core();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_core2_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Core2 events, ");
|
|
|
|
break;
|
|
|
|
|
2014-07-30 18:08:56 +08:00
|
|
|
case 30: /* 45nm Nehalem */
|
|
|
|
case 26: /* 45nm Nehalem-EP */
|
|
|
|
case 46: /* 45nm Nehalem-EX */
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2011-03-03 10:34:48 +08:00
|
|
|
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_nhm();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_nehalem_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
|
2010-03-26 21:08:44 +08:00
|
|
|
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
|
2011-03-03 10:34:47 +08:00
|
|
|
x86_pmu.extra_regs = intel_nehalem_extra_regs;
|
2011-04-27 17:51:41 +08:00
|
|
|
|
2013-01-24 23:10:32 +08:00
|
|
|
x86_pmu.cpu_events = nhm_events_attrs;
|
|
|
|
|
2011-04-29 20:17:19 +08:00
|
|
|
/* UOPS_ISSUED.STALLED_CYCLES */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-04-29 20:17:19 +08:00
|
|
|
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
|
2011-04-24 14:18:31 +08:00
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
x86_add_quirk(intel_nehalem_quirk);
|
2011-04-27 17:51:41 +08:00
|
|
|
|
2010-03-26 21:08:44 +08:00
|
|
|
pr_cont("Nehalem events, ");
|
2010-02-26 19:05:05 +08:00
|
|
|
break;
|
2010-03-03 19:02:30 +08:00
|
|
|
|
2014-07-30 18:08:56 +08:00
|
|
|
case 28: /* 45nm Atom "Pineview" */
|
|
|
|
case 38: /* 45nm Atom "Lincroft" */
|
|
|
|
case 39: /* 32nm Atom "Penwell" */
|
|
|
|
case 53: /* 32nm Atom "Cloverview" */
|
|
|
|
case 54: /* 32nm Atom "Cedarview" */
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_atom();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_gen_event_constraints;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
|
2015-12-04 04:03:10 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Atom events, ");
|
|
|
|
break;
|
|
|
|
|
2014-07-30 18:08:56 +08:00
|
|
|
case 55: /* 22nm Atom "Silvermont" */
|
2015-01-22 15:50:53 +08:00
|
|
|
case 76: /* 14nm Atom "Airmont" */
|
2014-07-30 18:08:56 +08:00
|
|
|
case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
|
2013-07-18 17:02:24 +08:00
|
|
|
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_atom();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_slm_extra_regs;
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2013-07-18 17:02:24 +08:00
|
|
|
pr_cont("Silvermont events, ");
|
|
|
|
break;
|
|
|
|
|
2014-07-30 18:08:56 +08:00
|
|
|
case 37: /* 32nm Westmere */
|
|
|
|
case 44: /* 32nm Westmere-EP */
|
|
|
|
case 47: /* 32nm Westmere-EX */
|
2010-02-26 19:05:05 +08:00
|
|
|
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2011-03-03 10:34:48 +08:00
|
|
|
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2010-02-26 19:05:05 +08:00
|
|
|
|
2010-03-03 19:02:30 +08:00
|
|
|
intel_pmu_lbr_init_nhm();
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
x86_pmu.event_constraints = intel_westmere_event_constraints;
|
2010-03-29 22:37:17 +08:00
|
|
|
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
|
2011-03-02 23:05:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
|
2011-03-03 10:34:47 +08:00
|
|
|
x86_pmu.extra_regs = intel_westmere_extra_regs;
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
2011-04-30 15:14:54 +08:00
|
|
|
|
2013-01-24 23:10:32 +08:00
|
|
|
x86_pmu.cpu_events = nhm_events_attrs;
|
|
|
|
|
2011-04-30 15:14:54 +08:00
|
|
|
/* UOPS_ISSUED.STALLED_CYCLES */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-04-30 15:14:54 +08:00
|
|
|
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
|
2011-04-30 15:14:54 +08:00
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
pr_cont("Westmere events, ");
|
|
|
|
break;
|
2010-02-01 22:36:30 +08:00
|
|
|
|
2014-07-30 18:08:56 +08:00
|
|
|
case 42: /* 32nm SandyBridge */
|
|
|
|
case 45: /* 32nm SandyBridge-E/EN/EP */
|
2012-06-05 16:26:43 +08:00
|
|
|
x86_add_quirk(intel_sandybridge_quirk);
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2011-03-02 21:27:04 +08:00
|
|
|
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2012-07-17 17:27:55 +08:00
|
|
|
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
2011-03-02 21:27:04 +08:00
|
|
|
|
2012-02-10 06:20:55 +08:00
|
|
|
intel_pmu_lbr_init_snb();
|
2011-03-02 21:27:04 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_snb_event_constraints;
|
2011-08-31 07:41:05 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
|
2012-06-05 16:26:43 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
|
2013-04-16 19:51:43 +08:00
|
|
|
if (boot_cpu_data.x86_model == 45)
|
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
|
|
|
else
|
|
|
|
x86_pmu.extra_regs = intel_snb_extra_regs;
|
2014-11-18 03:06:59 +08:00
|
|
|
|
|
|
|
|
2011-06-06 22:57:12 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2011-05-06 15:14:02 +08:00
|
|
|
|
2013-01-24 23:10:32 +08:00
|
|
|
x86_pmu.cpu_events = snb_events_attrs;
|
|
|
|
|
2011-05-06 15:14:02 +08:00
|
|
|
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
2011-05-06 15:14:02 +08:00
|
|
|
/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
|
2012-03-12 19:44:35 +08:00
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
|
|
|
|
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
|
2011-05-06 15:14:02 +08:00
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
pr_cont("SandyBridge events, ");
|
|
|
|
break;
|
2014-07-30 18:08:56 +08:00
|
|
|
|
|
|
|
case 58: /* 22nm IvyBridge */
|
|
|
|
case 62: /* 22nm IvyBridge-EP/EX */
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2012-09-11 07:07:01 +08:00
|
|
|
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
|
|
|
|
sizeof(hw_cache_event_ids));
|
2014-07-15 03:33:25 +08:00
|
|
|
/* dTLB-load-misses on IVB is different than SNB */
|
|
|
|
hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
|
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
|
|
|
|
sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
intel_pmu_lbr_init_snb();
|
|
|
|
|
2013-02-20 18:15:12 +08:00
|
|
|
x86_pmu.event_constraints = intel_ivb_event_constraints;
|
2012-09-11 07:07:01 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2013-04-16 19:51:43 +08:00
|
|
|
if (boot_cpu_data.x86_model == 62)
|
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
|
|
|
else
|
|
|
|
x86_pmu.extra_regs = intel_snb_extra_regs;
|
2012-09-11 07:07:01 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2012-09-11 07:07:01 +08:00
|
|
|
|
2013-01-24 23:10:32 +08:00
|
|
|
x86_pmu.cpu_events = snb_events_attrs;
|
|
|
|
|
2012-09-11 07:07:01 +08:00
|
|
|
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
|
|
|
|
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
|
|
|
|
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
|
|
|
|
|
|
|
|
pr_cont("IvyBridge events, ");
|
|
|
|
break;
|
|
|
|
|
2011-03-02 21:27:04 +08:00
|
|
|
|
2014-09-03 02:44:12 +08:00
|
|
|
case 60: /* 22nm Haswell Core */
|
|
|
|
case 63: /* 22nm Haswell Server */
|
|
|
|
case 69: /* 22nm Haswell ULT */
|
|
|
|
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
|
2014-11-18 03:06:59 +08:00
|
|
|
x86_add_quirk(intel_ht_bug);
|
2013-06-18 08:36:50 +08:00
|
|
|
x86_pmu.late_ack = true;
|
2015-02-18 10:18:04 +08:00
|
|
|
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
2013-06-18 08:36:48 +08:00
|
|
|
|
perf/x86/intel: Add basic Haswell LBR call stack support
Haswell has a new feature that utilizes the existing LBR facility to
record call chains. To enable this feature, bits (JCC, NEAR_IND_JMP,
NEAR_REL_JMP, FAR_BRANCH, EN_CALLSTACK) in LBR_SELECT must be set to 1,
bits (NEAR_REL_CALL, NEAR-IND_CALL, NEAR_RET) must be cleared. Due to
a hardware bug of Haswell, this feature doesn't work well with
FREEZE_LBRS_ON_PMI.
When the call stack feature is enabled, the LBR stack will capture
unfiltered call data normally, but as return instructions are executed,
the last captured branch record is flushed from the on-chip registers
in a last-in first-out (LIFO) manner. Thus, branch information relative
to leaf functions will not be captured, while preserving the call stack
information of the main line execution path.
This patch defines a separate lbr_sel map for Haswell. The map contains
a new entry for the call stack feature.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-05 10:56:00 +08:00
|
|
|
intel_pmu_lbr_init_hsw();
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_hsw_event_constraints;
|
2013-06-18 08:36:49 +08:00
|
|
|
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
|
2014-08-01 05:05:22 +08:00
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2013-06-18 08:36:48 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2013-06-18 08:36:48 +08:00
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
2013-06-18 08:36:52 +08:00
|
|
|
x86_pmu.cpu_events = hsw_events_attrs;
|
2013-09-20 22:40:44 +08:00
|
|
|
x86_pmu.lbr_double_abort = true;
|
2013-06-18 08:36:48 +08:00
|
|
|
pr_cont("Haswell events, ");
|
|
|
|
break;
|
|
|
|
|
2015-02-18 10:18:05 +08:00
|
|
|
case 61: /* 14nm Broadwell Core-M */
|
|
|
|
case 86: /* 14nm Broadwell Xeon D */
|
2015-06-12 04:52:22 +08:00
|
|
|
case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
|
|
|
|
case 79: /* 14nm Broadwell Server */
|
2015-02-18 10:18:05 +08:00
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
|
|
|
|
/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
|
|
|
|
hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
|
|
|
|
BDW_L3_MISS|HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
|
|
|
|
HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
|
|
|
|
BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
|
|
|
|
hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
|
|
|
|
BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
|
|
|
|
|
2015-04-02 16:12:57 +08:00
|
|
|
intel_pmu_lbr_init_hsw();
|
2015-02-18 10:18:05 +08:00
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_bdw_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_snbep_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2015-02-18 10:18:05 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
2014-11-18 03:06:53 +08:00
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
2015-02-18 10:18:05 +08:00
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
|
|
|
x86_pmu.cpu_events = hsw_events_attrs;
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
|
|
|
x86_pmu.limit_period = bdw_limit_period;
|
2015-02-18 10:18:05 +08:00
|
|
|
pr_cont("Broadwell events, ");
|
|
|
|
break;
|
|
|
|
|
2015-12-08 06:28:18 +08:00
|
|
|
case 87: /* Knights Landing Xeon Phi */
|
|
|
|
memcpy(hw_cache_event_ids,
|
|
|
|
slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs,
|
|
|
|
knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
intel_pmu_lbr_init_knl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_slm_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_knl_extra_regs;
|
|
|
|
|
|
|
|
/* all extra regs are per-cpu when HT is on */
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
|
|
|
|
pr_cont("Knights Landing events, ");
|
|
|
|
break;
|
|
|
|
|
2015-05-11 03:22:44 +08:00
|
|
|
case 78: /* 14nm Skylake Mobile */
|
|
|
|
case 94: /* 14nm Skylake Desktop */
|
|
|
|
x86_pmu.late_ack = true;
|
|
|
|
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
|
|
|
|
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
|
|
|
|
intel_pmu_lbr_init_skl();
|
|
|
|
|
|
|
|
x86_pmu.event_constraints = intel_skl_event_constraints;
|
|
|
|
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
|
|
|
|
x86_pmu.extra_regs = intel_skl_extra_regs;
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compare to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the sample number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
|
|
|
|
x86_pmu.pebs_prec_dist = true;
|
2015-05-11 03:22:44 +08:00
|
|
|
/* all extra regs are per-cpu when HT is on */
|
|
|
|
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
|
|
|
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
|
|
|
|
|
|
|
x86_pmu.hw_config = hsw_hw_config;
|
|
|
|
x86_pmu.get_event_constraints = hsw_get_event_constraints;
|
2015-09-10 05:53:59 +08:00
|
|
|
x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
|
|
|
|
skl_format_attr);
|
2015-05-11 03:22:44 +08:00
|
|
|
WARN_ON(!x86_pmu.format_attrs);
|
|
|
|
x86_pmu.cpu_events = hsw_events_attrs;
|
|
|
|
pr_cont("Skylake events, ");
|
|
|
|
break;
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
default:
|
2011-06-29 23:42:36 +08:00
|
|
|
switch (x86_pmu.version) {
|
|
|
|
case 1:
|
|
|
|
x86_pmu.event_constraints = intel_v1_event_constraints;
|
|
|
|
pr_cont("generic architected perfmon v1, ");
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
/*
|
|
|
|
* default constraints for v2 and up
|
|
|
|
*/
|
|
|
|
x86_pmu.event_constraints = intel_gen_event_constraints;
|
|
|
|
pr_cont("generic architected perfmon, ");
|
|
|
|
break;
|
|
|
|
}
|
2010-02-26 19:05:05 +08:00
|
|
|
}
|
2011-11-10 20:57:26 +08:00
|
|
|
|
2012-06-21 02:46:34 +08:00
|
|
|
if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
|
|
|
|
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
|
|
|
|
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
|
|
|
|
x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
|
|
|
|
}
|
|
|
|
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
|
|
|
|
|
|
|
|
if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
|
|
|
|
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
|
|
|
|
x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
|
|
|
|
x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
|
|
|
|
}
|
|
|
|
|
|
|
|
x86_pmu.intel_ctrl |=
|
|
|
|
((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
|
|
|
|
|
|
|
|
if (x86_pmu.event_constraints) {
|
|
|
|
/*
|
|
|
|
* event on fixed counter2 (REF_CYCLES) only works on this
|
|
|
|
* counter, so do not extend mask to generic counters
|
|
|
|
*/
|
|
|
|
for_each_event_constraint(c, x86_pmu.event_constraints) {
|
2015-06-08 20:46:49 +08:00
|
|
|
if (c->cmask == FIXED_EVENT_FLAGS
|
|
|
|
&& c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
|
|
|
|
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
|
2012-06-21 02:46:34 +08:00
|
|
|
}
|
2015-06-08 20:46:49 +08:00
|
|
|
c->idxmsk64 &=
|
|
|
|
~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
|
|
|
|
c->weight = hweight64(c->idxmsk64);
|
2012-06-21 02:46:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:25:56 +08:00
|
|
|
/*
|
|
|
|
* Access LBR MSR may cause #GP under certain circumstances.
|
|
|
|
* E.g. KVM doesn't support LBR MSR
|
|
|
|
* Check all LBT MSR here.
|
|
|
|
* Disable LBR access if any LBR MSRs can not be accessed.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
|
|
|
|
x86_pmu.lbr_nr = 0;
|
|
|
|
for (i = 0; i < x86_pmu.lbr_nr; i++) {
|
|
|
|
if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
|
|
|
|
check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
|
|
|
|
x86_pmu.lbr_nr = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Access extra MSR may cause #GP under certain circumstances.
|
|
|
|
* E.g. KVM doesn't support offcore event
|
|
|
|
* Check all extra_regs here.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.extra_regs) {
|
|
|
|
for (er = x86_pmu.extra_regs; er->msr; er++) {
|
2015-07-01 07:33:24 +08:00
|
|
|
er->extra_msr_access = check_msr(er->msr, 0x11UL);
|
2014-07-15 03:25:56 +08:00
|
|
|
/* Disable LBR select mapping */
|
|
|
|
if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
|
|
|
|
x86_pmu.lbr_sel_map = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-25 23:12:33 +08:00
|
|
|
/* Support full width counters using alternative MSR range */
|
|
|
|
if (x86_pmu.intel_cap.full_width_write) {
|
|
|
|
x86_pmu.max_period = x86_pmu.cntval_mask;
|
|
|
|
x86_pmu.perfctr = MSR_IA32_PMC0;
|
|
|
|
pr_cont("full-width counters, ");
|
|
|
|
}
|
|
|
|
|
2010-02-26 19:05:05 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* HT bug: phase 2 init
|
|
|
|
* Called once we have valid topology information to check
|
|
|
|
* whether or not HT is enabled
|
|
|
|
* If HT is off, then we disable the workaround
|
|
|
|
*/
|
|
|
|
static __init int fixup_ht_bug(void)
|
|
|
|
{
|
|
|
|
int cpu = smp_processor_id();
|
|
|
|
int w, c;
|
|
|
|
/*
|
|
|
|
* problem not present on this CPU model, nothing to do
|
|
|
|
*/
|
|
|
|
if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
|
|
|
|
return 0;
|
|
|
|
|
2015-05-26 21:11:28 +08:00
|
|
|
w = cpumask_weight(topology_sibling_cpumask(cpu));
|
2014-11-18 03:07:04 +08:00
|
|
|
if (w > 1) {
|
|
|
|
pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-09-05 06:45:28 +08:00
|
|
|
if (lockup_detector_suspend() != 0) {
|
2015-09-05 06:45:25 +08:00
|
|
|
pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
|
|
|
|
return 0;
|
|
|
|
}
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
|
|
|
|
|
|
|
|
x86_pmu.start_scheduling = NULL;
|
2015-05-21 16:57:32 +08:00
|
|
|
x86_pmu.commit_scheduling = NULL;
|
2014-11-18 03:07:04 +08:00
|
|
|
x86_pmu.stop_scheduling = NULL;
|
|
|
|
|
2015-09-05 06:45:28 +08:00
|
|
|
lockup_detector_resume();
|
2014-11-18 03:07:04 +08:00
|
|
|
|
|
|
|
get_online_cpus();
|
|
|
|
|
|
|
|
for_each_online_cpu(c) {
|
|
|
|
free_excl_cntrs(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
put_online_cpus();
|
|
|
|
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
subsys_initcall(fixup_ht_bug)
|