perf/core: Fix perf_event_read()

perf_event_read() has a number of issues regarding the timekeeping bits.

 - The IPI didn't update group times when it found the event INACTIVE.

 - The direct call would not re-check ->state after taking ctx->lock,
   which can result in ->count and timestamps getting out of sync.

And we can make use of the ordering introduced for perf_event_stop()
to make it more accurate for ACTIVE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Peter Zijlstra 2017-09-05 16:26:44 +02:00 committed by Ingo Molnar
parent 7f0ec32526
commit 0c1cbc18df
1 changed file with 39 additions and 16 deletions

View File

@ -2081,8 +2081,9 @@ event_sched_in(struct perf_event *event,
WRITE_ONCE(event->oncpu, smp_processor_id()); WRITE_ONCE(event->oncpu, smp_processor_id());
/* /*
* Order event::oncpu write to happen before the ACTIVE state * Order event::oncpu write to happen before the ACTIVE state is
* is visible. * visible. This allows perf_event_{stop,read}() to observe the correct
* ->oncpu if it sees ACTIVE.
*/ */
smp_wmb(); smp_wmb();
WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
@ -3638,12 +3639,16 @@ static void __perf_event_read(void *info)
return; return;
raw_spin_lock(&ctx->lock); raw_spin_lock(&ctx->lock);
if (ctx->is_active) { if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx); update_context_time(ctx);
update_cgrp_time_from_event(event); update_cgrp_time_from_event(event);
} }
update_event_times(event); if (!data->group)
update_event_times(event);
else
update_group_times(event);
if (event->state != PERF_EVENT_STATE_ACTIVE) if (event->state != PERF_EVENT_STATE_ACTIVE)
goto unlock; goto unlock;
@ -3658,7 +3663,6 @@ static void __perf_event_read(void *info)
pmu->read(event); pmu->read(event);
list_for_each_entry(sub, &event->sibling_list, group_entry) { list_for_each_entry(sub, &event->sibling_list, group_entry) {
update_event_times(sub);
if (sub->state == PERF_EVENT_STATE_ACTIVE) { if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/* /*
* Use sibling's PMU rather than @event's since * Use sibling's PMU rather than @event's since
@ -3748,23 +3752,35 @@ out:
static int perf_event_read(struct perf_event *event, bool group) static int perf_event_read(struct perf_event *event, bool group)
{ {
enum perf_event_state state = READ_ONCE(event->state);
int event_cpu, ret = 0; int event_cpu, ret = 0;
/* /*
* If event is enabled and currently active on a CPU, update the * If event is enabled and currently active on a CPU, update the
* value in the event structure: * value in the event structure:
*/ */
if (event->state == PERF_EVENT_STATE_ACTIVE) { again:
struct perf_read_data data = { if (state == PERF_EVENT_STATE_ACTIVE) {
.event = event, struct perf_read_data data;
.group = group,
.ret = 0, /*
}; * Orders the ->state and ->oncpu loads such that if we see
* ACTIVE we must also see the right ->oncpu.
*
* Matches the smp_wmb() from event_sched_in().
*/
smp_rmb();
event_cpu = READ_ONCE(event->oncpu); event_cpu = READ_ONCE(event->oncpu);
if ((unsigned)event_cpu >= nr_cpu_ids) if ((unsigned)event_cpu >= nr_cpu_ids)
return 0; return 0;
data = (struct perf_read_data){
.event = event,
.group = group,
.ret = 0,
};
preempt_disable(); preempt_disable();
event_cpu = __perf_event_read_cpu(event, event_cpu); event_cpu = __perf_event_read_cpu(event, event_cpu);
@ -3781,20 +3797,27 @@ static int perf_event_read(struct perf_event *event, bool group)
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
preempt_enable(); preempt_enable();
ret = data.ret; ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
} else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx; struct perf_event_context *ctx = event->ctx;
unsigned long flags; unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags); raw_spin_lock_irqsave(&ctx->lock, flags);
state = event->state;
if (state != PERF_EVENT_STATE_INACTIVE) {
raw_spin_unlock_irqrestore(&ctx->lock, flags);
goto again;
}
/* /*
* may read while context is not active * May read while context is not active (e.g., thread is
* (e.g., thread is blocked), in that case * blocked), in that case we cannot update context time
* we cannot update context time
*/ */
if (ctx->is_active) { if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx); update_context_time(ctx);
update_cgrp_time_from_event(event); update_cgrp_time_from_event(event);
} }
if (group) if (group)
update_group_times(event); update_group_times(event);
else else