perf: Fix perf_event_exit_task() race
There is a race between perf_event_exit_task() on one side and event_function_call(), find_get_context() and perf_install_in_context() on the other (in other words, everyone). Since there is no permanent marker on a context that it is dead, it is quite possible that we access (and even modify) a context after it has passed through perf_event_exit_task().

For instance, find_get_context() might find the context still installed, but by the time we get to perf_install_in_context() it might already have passed through perf_event_exit_task() and be considered dead; we will however still add the event to it.

Solve this by marking a ctx dead by setting its ctx->task value to -1. It must be !0 so we still know it is a (former) task context.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit 63b6da39bb
parent c97f473643
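The fix hinges on the tombstone idea described in the message above: once a task exits, its context keeps a poisoned, non-NULL ctx->task, so anyone who re-checks that pointer under ctx->lock can tell a dead (former) task context apart from both a live task context and a CPU context (NULL). What follows is a minimal, self-contained userspace sketch of that pattern only; it is not the kernel code. The names (struct ctx, ctx_mark_dead(), ctx_install(), TOMBSTONE) and the pthread mutex standing in for ctx->lock are illustrative assumptions.

#include <stdio.h>
#include <pthread.h>

#define TOMBSTONE ((void *)-1L)	/* non-NULL, but never a valid pointer */

struct ctx {
	pthread_mutex_t lock;
	void *task;	/* NULL: CPU context, TOMBSTONE: dead, otherwise: live task */
};

/* Task-exit path: mark the context dead while holding its lock. */
static void ctx_mark_dead(struct ctx *c)
{
	pthread_mutex_lock(&c->lock);
	c->task = TOMBSTONE;
	pthread_mutex_unlock(&c->lock);
}

/* A racing "install": re-check under the lock and bail out if dead. */
static int ctx_install(struct ctx *c)
{
	int ret = 0;

	pthread_mutex_lock(&c->lock);
	if (c->task == TOMBSTONE)
		ret = -1;	/* context already dead: do not add events */
	/* else: safe to add the event to the context here */
	pthread_mutex_unlock(&c->lock);
	return ret;
}

int main(void)
{
	/* (void *)0x1 is a stand-in for a live task pointer */
	struct ctx c = { .lock = PTHREAD_MUTEX_INITIALIZER, .task = (void *)0x1 };

	ctx_mark_dead(&c);
	printf("install after exit: %d\n", ctx_install(&c));	/* prints -1 */
	return 0;
}

The important property is that the marker is both non-NULL (so the context is still recognisably a task context) and never a valid pointer, which is exactly what ((void *)-1L) gives TASK_TOMBSTONE in the diff below.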
kernel/events/core.c

@@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 	raw_spin_unlock(&cpuctx->ctx.lock);
 }
 
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+	return event->owner == TASK_TOMBSTONE;
+}
+
 /*
  * On task ctx scheduling...
  *
@@ -196,31 +203,21 @@ static int event_function(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	int ret = 0;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
+	perf_ctx_lock(cpuctx, task_ctx);
 	/*
 	 * Since we do the IPI call without holding ctx->lock things can have
 	 * changed, double check we hit the task we set out to hit.
-	 *
-	 * If ctx->task == current, we know things must remain valid because
-	 * we have IRQs disabled so we cannot schedule.
 	 */
 	if (ctx->task) {
-		if (ctx->task != current)
-			return -EAGAIN;
-
-		WARN_ON_ONCE(task_ctx != ctx);
-	} else {
-		WARN_ON_ONCE(&cpuctx->ctx != ctx);
-	}
-
-	perf_ctx_lock(cpuctx, task_ctx);
-	/*
-	 * Now that we hold locks, double check state. Paranoia pays.
-	 */
-	if (task_ctx) {
-		WARN_ON_ONCE(task_ctx->task != current);
+		if (ctx->task != current) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+
 		/*
 		 * We only use event_function_call() on established contexts,
 		 * and event_function() is only ever called when active (or
@@ -233,12 +230,16 @@ static int event_function(void *info)
 		 * And since we have ctx->is_active, cpuctx->task_ctx must
 		 * match.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+		WARN_ON_ONCE(task_ctx != ctx);
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
 	}
+
 	efs->func(event, cpuctx, ctx, efs->data);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 static void event_function_local(struct perf_event *event, event_f func, void *data)
@@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
+	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 	struct event_function_struct efs = {
 		.event = event,
 		.func = func,
@@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 	}
 
 again:
+	if (task == TASK_TOMBSTONE)
+		return;
+
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
 	raw_spin_lock_irq(&ctx->lock);
-	if (ctx->is_active) {
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		raw_spin_unlock_irq(&ctx->lock);
-		goto again;
+	/*
+	 * Reload the task pointer, it might have been changed by
+	 * a concurrent perf_event_context_sched_out().
+	 */
+	task = ctx->task;
+	if (task != TASK_TOMBSTONE) {
+		if (ctx->is_active) {
+			raw_spin_unlock_irq(&ctx->lock);
+			goto again;
+		}
+		func(event, NULL, ctx, data);
 	}
-	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
-	return event->owner == EVENT_OWNER_KERNEL;
-}
-
 #define PERF_FLAG_ALL	(PERF_FLAG_FD_NO_GROUP |\
 			 PERF_FLAG_FD_OUTPUT  |\
 			 PERF_FLAG_PID_CGROUP |\
@@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx)
 	if (atomic_dec_and_test(&ctx->refcount)) {
 		if (ctx->parent_ctx)
 			put_ctx(ctx->parent_ctx);
-		if (ctx->task)
+		if (ctx->task && ctx->task != TASK_TOMBSTONE)
 			put_task_struct(ctx->task);
 		call_rcu(&ctx->rcu_head, free_ctx);
 	}
@@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event)
 
 /*
  * Get the perf_event_context for a task and lock it.
+ *
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
@@ -1226,10 +1226,13 @@ retry:
 			goto retry;
 		}
 
-		if (!atomic_inc_not_zero(&ctx->refcount)) {
+		if (ctx->task == TASK_TOMBSTONE ||
+		    !atomic_inc_not_zero(&ctx->refcount)) {
 			raw_spin_unlock(&ctx->lock);
 			ctx = NULL;
 		}
+
+		WARN_ON_ONCE(ctx->task != task);
 	}
 	rcu_read_unlock();
 	if (!ctx)
@@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
+	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
+		raw_spin_lock(&ctx->lock);
 		/*
 		 * If we hit the 'wrong' task, we've since scheduled and
 		 * everything should be sorted, nothing to do!
 		 */
+		task_ctx = ctx;
 		if (ctx->task != current)
-			return 0;
+			goto unlock;
 
 		/*
 		 * If task_ctx is set, it had better be to us.
 		 */
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
-		task_ctx = ctx;
+	} else if (task_ctx) {
+		raw_spin_lock(&task_ctx->lock);
 	}
 
-	perf_ctx_lock(cpuctx, task_ctx);
 	ctx_resched(cpuctx, task_ctx);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
 	return 0;
@@ -2188,6 +2195,17 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 * happened and that will have taken care of business.
 	 */
 	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
+	/*
+	 * Worse, we cannot even rely on the ctx actually existing anymore. If
+	 * between find_get_context() and perf_install_in_context() the task
+	 * went through perf_event_exit_task() its dead and we should not be
+	 * adding new events.
+	 */
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}
 	update_context_time(ctx);
 	/*
 	 * Update cgrp time only if current cgrp matches event->cgrp.
@@ -2195,7 +2213,6 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 */
 	update_cgrp_time_from_event(event);
 	add_event_to_ctx(event, ctx);
-	task = ctx->task;
 	raw_spin_unlock_irq(&ctx->lock);
 
 	if (task)
@@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
-			/*
-			 * XXX do we need a memory barrier of sorts
-			 * wrt to rcu_dereference() of perf_event_ctxp
-			 */
-			task->perf_event_ctxp[ctxn] = next_ctx;
-			next->perf_event_ctxp[ctxn] = ctx;
-			ctx->task = next;
-			next_ctx->task = task;
+			WRITE_ONCE(ctx->task, next);
+			WRITE_ONCE(next_ctx->task, task);
 
 			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+			/*
+			 * RCU_INIT_POINTER here is safe because we've not
+			 * modified the ctx and the above modification of
+			 * ctx->task and ctx->task_ctx_data are immaterial
+			 * since those values are always verified under
+			 * ctx->lock which we're now holding.
+			 */
+			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
 			do_switch = 0;
 
 			perf_event_sync_stat(ctx, next_ctx);
@@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	}
 
 	/* Mark owner so we could distinguish it from user events. */
-	event->owner = EVENT_OWNER_KERNEL;
+	event->owner = TASK_TOMBSTONE;
 
 	account_event(event);
 
@@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event,
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
-	struct perf_event *child_event, *next;
 	struct perf_event_context *child_ctx, *clone_ctx = NULL;
+	struct perf_event *child_event, *next;
+	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp[ctxn]))
-		return;
+	WARN_ON_ONCE(child != current);
 
-	local_irq_disable();
-	WARN_ON_ONCE(child != current);
-	/*
-	 * We can't reschedule here because interrupts are disabled,
-	 * and child must be current.
-	 */
-	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+	child_ctx = perf_lock_task_context(child, ctxn, &flags);
+	if (!child_ctx)
+		return;
 
+	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+
 	/*
-	 * Take the context lock here so that if find_get_context is
-	 * reading child->perf_event_ctxp, we wait until it has
-	 * incremented the context's refcount before we do put_ctx below.
+	 * Now that the context is inactive, destroy the task <-> ctx relation
+	 * and mark the context dead.
 	 */
-	raw_spin_lock(&child_ctx->lock);
-	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
-	child->perf_event_ctxp[ctxn] = NULL;
+	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+	put_ctx(child_ctx); /* cannot be last */
+	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+	put_task_struct(current); /* cannot be last */
 
 	/*
 	 * If this context is a clone; unclone it so it can't get
@@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 */
 	clone_ctx = unclone_ctx(child_ctx);
 	update_context_time(child_ctx);
-	raw_spin_unlock_irq(&child_ctx->lock);
+	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	if (clone_ctx)
 		put_ctx(clone_ctx);