io_uring: cancel sqpoll via task_work
1) The first problem is io_uring_cancel_sqpoll() -> io_uring_cancel_task_requests() basically doing park(); park(); and so hanging. 2) Another one is more subtle, when the master task is doing cancellations, but SQPOLL task submits in-between the end of the cancellation but before finish() requests taking a ref to the ctx, and so eternally locking it up. 3) Yet another is a dying SQPOLL task doing io_uring_cancel_sqpoll() and same io_uring_cancel_sqpoll() from the owner task, they race for tctx->wait events. And there probably more of them. Instead do SQPOLL cancellations from within SQPOLL task context via task_work, see io_sqpoll_cancel_sync(). With that we don't need temporal park()/unpark() during cancellation, which is ugly, subtle and anyway doesn't allow to do io_run_task_work() properly. io_uring_cancel_sqpoll() is called only from SQPOLL task context and under sqd locking, so all parking is removed from there. And so, io_sq_thread_[un]park() and io_sq_thread_stop() are not used now by SQPOLL task, and that spare us from some headache. Also remove ctx->sqd_list early to avoid 2). And kill tctx->sqpoll, which is not used anymore. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
26984fbf3a
commit
521d6a737a
151
fs/io_uring.c
151
fs/io_uring.c
|
@ -6665,6 +6665,7 @@ static int io_sq_thread(void *data)
|
|||
up_read(&sqd->rw_lock);
|
||||
cond_resched();
|
||||
down_read(&sqd->rw_lock);
|
||||
io_run_task_work();
|
||||
timeout = jiffies + sqd->sq_thread_idle;
|
||||
continue;
|
||||
}
|
||||
|
@ -6720,18 +6721,22 @@ static int io_sq_thread(void *data)
|
|||
finish_wait(&sqd->wait, &wait);
|
||||
timeout = jiffies + sqd->sq_thread_idle;
|
||||
}
|
||||
up_read(&sqd->rw_lock);
|
||||
down_write(&sqd->rw_lock);
|
||||
/*
|
||||
* someone may have parked and added a cancellation task_work, run
|
||||
* it first because we don't want it in io_uring_cancel_sqpoll()
|
||||
*/
|
||||
io_run_task_work();
|
||||
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
|
||||
io_uring_cancel_sqpoll(ctx);
|
||||
up_read(&sqd->rw_lock);
|
||||
|
||||
io_run_task_work();
|
||||
|
||||
down_write(&sqd->rw_lock);
|
||||
sqd->thread = NULL;
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
|
||||
io_ring_set_wakeup_flag(ctx);
|
||||
up_write(&sqd->rw_lock);
|
||||
|
||||
io_run_task_work();
|
||||
complete(&sqd->exited);
|
||||
do_exit(0);
|
||||
}
|
||||
|
@ -7033,8 +7038,8 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
|
|||
static void io_sq_thread_unpark(struct io_sq_data *sqd)
|
||||
__releases(&sqd->rw_lock)
|
||||
{
|
||||
if (sqd->thread == current)
|
||||
return;
|
||||
WARN_ON_ONCE(sqd->thread == current);
|
||||
|
||||
clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
|
||||
up_write(&sqd->rw_lock);
|
||||
}
|
||||
|
@ -7042,8 +7047,8 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd)
|
|||
static void io_sq_thread_park(struct io_sq_data *sqd)
|
||||
__acquires(&sqd->rw_lock)
|
||||
{
|
||||
if (sqd->thread == current)
|
||||
return;
|
||||
WARN_ON_ONCE(sqd->thread == current);
|
||||
|
||||
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
|
||||
down_write(&sqd->rw_lock);
|
||||
/* set again for consistency, in case concurrent parks are happening */
|
||||
|
@ -7054,8 +7059,8 @@ static void io_sq_thread_park(struct io_sq_data *sqd)
|
|||
|
||||
static void io_sq_thread_stop(struct io_sq_data *sqd)
|
||||
{
|
||||
if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
|
||||
return;
|
||||
WARN_ON_ONCE(sqd->thread == current);
|
||||
|
||||
down_write(&sqd->rw_lock);
|
||||
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
|
||||
if (sqd->thread)
|
||||
|
@ -7078,7 +7083,7 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
|
|||
|
||||
if (sqd) {
|
||||
io_sq_thread_park(sqd);
|
||||
list_del(&ctx->sqd_list);
|
||||
list_del_init(&ctx->sqd_list);
|
||||
io_sqd_update_thread_idle(sqd);
|
||||
io_sq_thread_unpark(sqd);
|
||||
|
||||
|
@ -7760,7 +7765,6 @@ static int io_uring_alloc_task_context(struct task_struct *task,
|
|||
init_waitqueue_head(&tctx->wait);
|
||||
tctx->last = NULL;
|
||||
atomic_set(&tctx->in_idle, 0);
|
||||
tctx->sqpoll = false;
|
||||
task->io_uring = tctx;
|
||||
spin_lock_init(&tctx->task_lock);
|
||||
INIT_WQ_LIST(&tctx->task_list);
|
||||
|
@ -8719,45 +8723,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
|
|||
|
||||
io_uring_try_cancel_requests(ctx, task, files);
|
||||
|
||||
if (ctx->sq_data)
|
||||
io_sq_thread_unpark(ctx->sq_data);
|
||||
prepare_to_wait(&task->io_uring->wait, &wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
if (inflight == io_uring_count_inflight(ctx, task, files))
|
||||
schedule();
|
||||
finish_wait(&task->io_uring->wait, &wait);
|
||||
if (ctx->sq_data)
|
||||
io_sq_thread_park(ctx->sq_data);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to iteratively cancel requests, in case a request has dependent
|
||||
* hard links. These persist even for failure of cancelations, hence keep
|
||||
* looping until none are found.
|
||||
*/
|
||||
static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
|
||||
struct files_struct *files)
|
||||
{
|
||||
struct task_struct *task = current;
|
||||
|
||||
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
|
||||
io_sq_thread_park(ctx->sq_data);
|
||||
task = ctx->sq_data->thread;
|
||||
if (task)
|
||||
atomic_inc(&task->io_uring->in_idle);
|
||||
}
|
||||
|
||||
io_uring_cancel_files(ctx, task, files);
|
||||
if (!files)
|
||||
io_uring_try_cancel_requests(ctx, task, NULL);
|
||||
|
||||
if (task)
|
||||
atomic_dec(&task->io_uring->in_idle);
|
||||
if (ctx->sq_data)
|
||||
io_sq_thread_unpark(ctx->sq_data);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that this task has used io_uring. We use it for cancelation purposes.
|
||||
*/
|
||||
|
@ -8796,15 +8769,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx)
|
|||
}
|
||||
tctx->last = ctx;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is race safe in that the task itself is doing this, hence it
|
||||
* cannot be going through the exit/cancel paths at the same time.
|
||||
* This cannot be modified while exit/cancel is running.
|
||||
*/
|
||||
if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
|
||||
tctx->sqpoll = true;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -8847,6 +8811,44 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
|
|||
}
|
||||
}
|
||||
|
||||
static s64 tctx_inflight(struct io_uring_task *tctx)
|
||||
{
|
||||
return percpu_counter_sum(&tctx->inflight);
|
||||
}
|
||||
|
||||
static void io_sqpoll_cancel_cb(struct callback_head *cb)
|
||||
{
|
||||
struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
|
||||
struct io_ring_ctx *ctx = work->ctx;
|
||||
struct io_sq_data *sqd = ctx->sq_data;
|
||||
|
||||
if (sqd->thread)
|
||||
io_uring_cancel_sqpoll(ctx);
|
||||
complete(&work->completion);
|
||||
}
|
||||
|
||||
static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_sq_data *sqd = ctx->sq_data;
|
||||
struct io_tctx_exit work = { .ctx = ctx, };
|
||||
struct task_struct *task;
|
||||
|
||||
io_sq_thread_park(sqd);
|
||||
list_del_init(&ctx->sqd_list);
|
||||
io_sqd_update_thread_idle(sqd);
|
||||
task = sqd->thread;
|
||||
if (task) {
|
||||
init_completion(&work.completion);
|
||||
init_task_work(&work.task_work, io_sqpoll_cancel_cb);
|
||||
WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
|
||||
wake_up_process(task);
|
||||
}
|
||||
io_sq_thread_unpark(sqd);
|
||||
|
||||
if (task)
|
||||
wait_for_completion(&work.completion);
|
||||
}
|
||||
|
||||
void __io_uring_files_cancel(struct files_struct *files)
|
||||
{
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
@ -8855,41 +8857,40 @@ void __io_uring_files_cancel(struct files_struct *files)
|
|||
|
||||
/* make sure overflow events are dropped */
|
||||
atomic_inc(&tctx->in_idle);
|
||||
xa_for_each(&tctx->xa, index, node)
|
||||
io_uring_cancel_task_requests(node->ctx, files);
|
||||
xa_for_each(&tctx->xa, index, node) {
|
||||
struct io_ring_ctx *ctx = node->ctx;
|
||||
|
||||
if (ctx->sq_data) {
|
||||
io_sqpoll_cancel_sync(ctx);
|
||||
continue;
|
||||
}
|
||||
io_uring_cancel_files(ctx, current, files);
|
||||
if (!files)
|
||||
io_uring_try_cancel_requests(ctx, current, NULL);
|
||||
}
|
||||
atomic_dec(&tctx->in_idle);
|
||||
|
||||
if (files)
|
||||
io_uring_clean_tctx(tctx);
|
||||
}
|
||||
|
||||
static s64 tctx_inflight(struct io_uring_task *tctx)
|
||||
{
|
||||
return percpu_counter_sum(&tctx->inflight);
|
||||
}
|
||||
|
||||
/* should only be called by SQPOLL task */
|
||||
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_sq_data *sqd = ctx->sq_data;
|
||||
struct io_uring_task *tctx;
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
s64 inflight;
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
if (!sqd)
|
||||
return;
|
||||
io_sq_thread_park(sqd);
|
||||
if (!sqd->thread || !sqd->thread->io_uring) {
|
||||
io_sq_thread_unpark(sqd);
|
||||
return;
|
||||
}
|
||||
tctx = ctx->sq_data->thread->io_uring;
|
||||
WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
|
||||
|
||||
atomic_inc(&tctx->in_idle);
|
||||
do {
|
||||
/* read completions before cancelations */
|
||||
inflight = tctx_inflight(tctx);
|
||||
if (!inflight)
|
||||
break;
|
||||
io_uring_cancel_task_requests(ctx, NULL);
|
||||
io_uring_try_cancel_requests(ctx, current, NULL);
|
||||
|
||||
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||
/*
|
||||
|
@ -8902,7 +8903,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
|
|||
finish_wait(&tctx->wait, &wait);
|
||||
} while (1);
|
||||
atomic_dec(&tctx->in_idle);
|
||||
io_sq_thread_unpark(sqd);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -8917,15 +8917,6 @@ void __io_uring_task_cancel(void)
|
|||
|
||||
/* make sure overflow events are dropped */
|
||||
atomic_inc(&tctx->in_idle);
|
||||
|
||||
if (tctx->sqpoll) {
|
||||
struct io_tctx_node *node;
|
||||
unsigned long index;
|
||||
|
||||
xa_for_each(&tctx->xa, index, node)
|
||||
io_uring_cancel_sqpoll(node->ctx);
|
||||
}
|
||||
|
||||
do {
|
||||
/* read completions before cancelations */
|
||||
inflight = tctx_inflight(tctx);
|
||||
|
|
Loading…
Reference in New Issue