io_uring: cancel sqpoll via task_work

1) The first problem is io_uring_cancel_sqpoll() ->
io_uring_cancel_task_requests() basically doing park(); park(); and so
hanging.

2) Another one is more subtle, when the master task is doing cancellations,
but SQPOLL task submits in-between the end of the cancellation but
before finish() requests taking a ref to the ctx, and so eternally
locking it up.

3) Yet another is a dying SQPOLL task doing io_uring_cancel_sqpoll() and
same io_uring_cancel_sqpoll() from the owner task, they race for
tctx->wait events. And there probably more of them.

Instead do SQPOLL cancellations from within SQPOLL task context via
task_work, see io_sqpoll_cancel_sync(). With that we don't need temporal
park()/unpark() during cancellation, which is ugly, subtle and anyway
doesn't allow to do io_run_task_work() properly.

io_uring_cancel_sqpoll() is called only from SQPOLL task context and
under sqd locking, so all parking is removed from there. And so,
io_sq_thread_[un]park() and io_sq_thread_stop() are not used now by
SQPOLL task, and that spare us from some headache.

Also remove ctx->sqd_list early to avoid 2). And kill tctx->sqpoll,
which is not used anymore.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Pavel Begunkov 2021-03-11 23:29:38 +00:00 committed by Jens Axboe
parent 26984fbf3a
commit 521d6a737a
1 changed files with 71 additions and 80 deletions

View File

@ -6665,6 +6665,7 @@ static int io_sq_thread(void *data)
up_read(&sqd->rw_lock);
cond_resched();
down_read(&sqd->rw_lock);
io_run_task_work();
timeout = jiffies + sqd->sq_thread_idle;
continue;
}
@ -6720,18 +6721,22 @@ static int io_sq_thread(void *data)
finish_wait(&sqd->wait, &wait);
timeout = jiffies + sqd->sq_thread_idle;
}
up_read(&sqd->rw_lock);
down_write(&sqd->rw_lock);
/*
* someone may have parked and added a cancellation task_work, run
* it first because we don't want it in io_uring_cancel_sqpoll()
*/
io_run_task_work();
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_uring_cancel_sqpoll(ctx);
up_read(&sqd->rw_lock);
io_run_task_work();
down_write(&sqd->rw_lock);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
up_write(&sqd->rw_lock);
io_run_task_work();
complete(&sqd->exited);
do_exit(0);
}
@ -7033,8 +7038,8 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
static void io_sq_thread_unpark(struct io_sq_data *sqd)
__releases(&sqd->rw_lock)
{
if (sqd->thread == current)
return;
WARN_ON_ONCE(sqd->thread == current);
clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
up_write(&sqd->rw_lock);
}
@ -7042,8 +7047,8 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd)
static void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->rw_lock)
{
if (sqd->thread == current)
return;
WARN_ON_ONCE(sqd->thread == current);
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
down_write(&sqd->rw_lock);
/* set again for consistency, in case concurrent parks are happening */
@ -7054,8 +7059,8 @@ static void io_sq_thread_park(struct io_sq_data *sqd)
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
return;
WARN_ON_ONCE(sqd->thread == current);
down_write(&sqd->rw_lock);
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
if (sqd->thread)
@ -7078,7 +7083,7 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
if (sqd) {
io_sq_thread_park(sqd);
list_del(&ctx->sqd_list);
list_del_init(&ctx->sqd_list);
io_sqd_update_thread_idle(sqd);
io_sq_thread_unpark(sqd);
@ -7760,7 +7765,6 @@ static int io_uring_alloc_task_context(struct task_struct *task,
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
tctx->sqpoll = false;
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@ -8719,45 +8723,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
io_uring_try_cancel_requests(ctx, task, files);
if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
prepare_to_wait(&task->io_uring->wait, &wait,
TASK_UNINTERRUPTIBLE);
if (inflight == io_uring_count_inflight(ctx, task, files))
schedule();
finish_wait(&task->io_uring->wait, &wait);
if (ctx->sq_data)
io_sq_thread_park(ctx->sq_data);
}
}
/*
* We need to iteratively cancel requests, in case a request has dependent
* hard links. These persist even for failure of cancelations, hence keep
* looping until none are found.
*/
static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
struct files_struct *files)
{
struct task_struct *task = current;
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
io_sq_thread_park(ctx->sq_data);
task = ctx->sq_data->thread;
if (task)
atomic_inc(&task->io_uring->in_idle);
}
io_uring_cancel_files(ctx, task, files);
if (!files)
io_uring_try_cancel_requests(ctx, task, NULL);
if (task)
atomic_dec(&task->io_uring->in_idle);
if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
}
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
@ -8796,15 +8769,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx)
}
tctx->last = ctx;
}
/*
* This is race safe in that the task itself is doing this, hence it
* cannot be going through the exit/cancel paths at the same time.
* This cannot be modified while exit/cancel is running.
*/
if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
tctx->sqpoll = true;
return 0;
}
@ -8847,6 +8811,44 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
}
}
static s64 tctx_inflight(struct io_uring_task *tctx)
{
return percpu_counter_sum(&tctx->inflight);
}
static void io_sqpoll_cancel_cb(struct callback_head *cb)
{
struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
struct io_ring_ctx *ctx = work->ctx;
struct io_sq_data *sqd = ctx->sq_data;
if (sqd->thread)
io_uring_cancel_sqpoll(ctx);
complete(&work->completion);
}
static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
{
struct io_sq_data *sqd = ctx->sq_data;
struct io_tctx_exit work = { .ctx = ctx, };
struct task_struct *task;
io_sq_thread_park(sqd);
list_del_init(&ctx->sqd_list);
io_sqd_update_thread_idle(sqd);
task = sqd->thread;
if (task) {
init_completion(&work.completion);
init_task_work(&work.task_work, io_sqpoll_cancel_cb);
WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
wake_up_process(task);
}
io_sq_thread_unpark(sqd);
if (task)
wait_for_completion(&work.completion);
}
void __io_uring_files_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
@ -8855,41 +8857,40 @@ void __io_uring_files_cancel(struct files_struct *files)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
xa_for_each(&tctx->xa, index, node)
io_uring_cancel_task_requests(node->ctx, files);
xa_for_each(&tctx->xa, index, node) {
struct io_ring_ctx *ctx = node->ctx;
if (ctx->sq_data) {
io_sqpoll_cancel_sync(ctx);
continue;
}
io_uring_cancel_files(ctx, current, files);
if (!files)
io_uring_try_cancel_requests(ctx, current, NULL);
}
atomic_dec(&tctx->in_idle);
if (files)
io_uring_clean_tctx(tctx);
}
static s64 tctx_inflight(struct io_uring_task *tctx)
{
return percpu_counter_sum(&tctx->inflight);
}
/* should only be called by SQPOLL task */
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
{
struct io_sq_data *sqd = ctx->sq_data;
struct io_uring_task *tctx;
struct io_uring_task *tctx = current->io_uring;
s64 inflight;
DEFINE_WAIT(wait);
if (!sqd)
return;
io_sq_thread_park(sqd);
if (!sqd->thread || !sqd->thread->io_uring) {
io_sq_thread_unpark(sqd);
return;
}
tctx = ctx->sq_data->thread->io_uring;
WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
inflight = tctx_inflight(tctx);
if (!inflight)
break;
io_uring_cancel_task_requests(ctx, NULL);
io_uring_try_cancel_requests(ctx, current, NULL);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
/*
@ -8902,7 +8903,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
finish_wait(&tctx->wait, &wait);
} while (1);
atomic_dec(&tctx->in_idle);
io_sq_thread_unpark(sqd);
}
/*
@ -8917,15 +8917,6 @@ void __io_uring_task_cancel(void)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
if (tctx->sqpoll) {
struct io_tctx_node *node;
unsigned long index;
xa_for_each(&tctx->xa, index, node)
io_uring_cancel_sqpoll(node->ctx);
}
do {
/* read completions before cancelations */
inflight = tctx_inflight(tctx);