From a146468d76e0462393a3e15b77b8b3ede60e2d06 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Dec 2020 20:57:27 -0700 Subject: [PATCH 01/17] io_uring: break links on shutdown failure Ensure that the return value of __sys_shutdown_sock() is used to potentially break links to the request, if we fail. Fixes: 36f4fa6886a8 ("io_uring: add support for shutdown(2)") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f9392c35eef..6a4560c9ed9a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3784,6 +3784,8 @@ static int io_shutdown(struct io_kiocb *req, bool force_nonblock) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, req->shutdown.how); + if (ret < 0) + req_set_fail_links(req); io_req_complete(req, ret); return 0; #else From 4bc4a912534a72f1c96f483448f0be16e5a48063 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Dec 2020 07:53:33 -0700 Subject: [PATCH 02/17] io_uring: hold mmap_sem for mm->locked_vm manipulation The kernel doesn't seem to have clear rules around this, but various spots are using the mmap_sem to serialize access to modifying the locked_vm count. Play it safe and lock the mm for write when accounting or unaccounting locked memory. Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a4560c9ed9a..2d07d35e7262 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8157,10 +8157,13 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) { - if (acct == ACCT_LOCKED) + if (acct == ACCT_LOCKED) { + mmap_write_lock(ctx->mm_account); ctx->mm_account->locked_vm -= nr_pages; - else if (acct == ACCT_PINNED) + mmap_write_unlock(ctx->mm_account); + }else if (acct == ACCT_PINNED) { atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); + } } } @@ -8176,10 +8179,13 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, } if (ctx->mm_account) { - if (acct == ACCT_LOCKED) + if (acct == ACCT_LOCKED) { + mmap_write_lock(ctx->mm_account); ctx->mm_account->locked_vm += nr_pages; - else if (acct == ACCT_PINNED) + mmap_write_unlock(ctx->mm_account); + } else if (acct == ACCT_PINNED) { atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + } } return 0; From cda286f0715c82f8117e166afd42cca068876dde Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:35 +0000 Subject: [PATCH 03/17] io_uring: cancel reqs shouldn't kill overflow list io_uring_cancel_task_requests() doesn't imply that the ring is going away, it may continue to work well after that. The problem is that it sets ->cq_overflow_flushed effectively disabling the CQ overflow feature Split setting cq_overflow_flushed from flush, and do the first one only on exit. It's ok in terms of cancellations because there is a io_uring->in_idle check in __io_cqring_fill_event(). It also fixes a race with setting ->cq_overflow_flushed in io_uring_cancel_task_requests, whuch's is not atomic and a part of a bitmask with other flags. Though, the only other flag that's not set during init is drain_next, so it's not as bad for sane architectures. Signed-off-by: Pavel Begunkov Fixes: 0f2122045b946 ("io_uring: don't rely on weak ->files references") Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d07d35e7262..0d0217281ccf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1733,10 +1733,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, spin_lock_irqsave(&ctx->completion_lock, flags); - /* if force is set, the ring is going away. always drop after that */ - if (force) - ctx->cq_overflow_flushed = 1; - cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) @@ -8655,6 +8651,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); + /* if force is set, the ring is going away. always drop after that */ + ctx->cq_overflow_flushed = 1; if (ctx->rings) io_cqring_overflow_flush(ctx, true, NULL, NULL); mutex_unlock(&ctx->uring_lock); From 9cd2be519d05ee78876d55e8e902b7125f78b74f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:36 +0000 Subject: [PATCH 04/17] io_uring: remove racy overflow list fast checks list_empty_careful() is not racy only if some conditions are met, i.e. no re-adds after del_init. io_cqring_overflow_flush() does list_move(), so it's actually racy. Remove those checks, we have ->cq_check_overflow for the fast path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d0217281ccf..fa3cf564ec06 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1724,8 +1724,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, LIST_HEAD(list); if (!force) { - if (list_empty_careful(&ctx->cq_overflow_list)) - return true; if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)) return false; @@ -6822,8 +6820,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false, NULL, NULL)) + if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; } From e23de15fdbd3070446b2d212373c0ae556f63d93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:37 +0000 Subject: [PATCH 05/17] io_uring: consolidate CQ nr events calculation Add a helper which calculates number of events in CQ. Handcoded version of it in io_cqring_overflow_flush() is not the clearest thing, so it makes it slightly more readable. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fa3cf564ec06..56ce97fc9c02 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1693,6 +1693,11 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return io_wq_current_is_worker(); } +static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) @@ -1723,14 +1728,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, unsigned long flags; LIST_HEAD(list); - if (!force) { - if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == - rings->cq_ring_entries)) - return false; - } + if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) + return false; spin_lock_irqsave(&ctx->completion_lock, flags); - cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) @@ -2314,8 +2315,6 @@ static void io_double_put_req(struct io_kiocb *req) static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { - struct io_rings *rings = ctx->rings; - if (test_bit(0, &ctx->cq_check_overflow)) { /* * noflush == true is from the waitqueue handler, just ensure @@ -2330,7 +2329,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) /* See comment at the top of this file */ smp_rmb(); - return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); + return __io_cqring_events(ctx); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) From 09e88404f46cc32237f596c66f48a826294e08f2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:38 +0000 Subject: [PATCH 06/17] io_uring: inline io_cqring_mark_overflow() There is only one user of it and the name is misleading, get rid of it by inlining. By the way make overflow_flush's return value deduction simpler. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 56ce97fc9c02..5c9aa8b5e077 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1708,15 +1708,6 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) -{ - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, @@ -1726,13 +1717,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct io_kiocb *req, *tmp; struct io_uring_cqe *cqe; unsigned long flags; + bool all_flushed; LIST_HEAD(list); if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) return false; spin_lock_irqsave(&ctx->completion_lock, flags); - cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) continue; @@ -1753,9 +1744,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, } } - io_commit_cqring(ctx); - io_cqring_mark_overflow(ctx); + all_flushed = list_empty(&ctx->cq_overflow_list); + if (all_flushed) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1765,7 +1761,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, io_put_req(req); } - return cqe != NULL; + return all_flushed; } static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) From 89448c47b8452b67c146dc6cad6f737e004c5caf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:39 +0000 Subject: [PATCH 07/17] io_uring: limit {io|sq}poll submit locking scope We don't need to take uring_lock for SQPOLL|IOPOLL to do io_cqring_overflow_flush() when cq_overflow_list is empty, remove it from the hot path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c9aa8b5e077..8cf6f22afc5e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9154,10 +9154,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); - if (!list_empty_careful(&ctx->cq_overflow_list)) + if (!list_empty_careful(&ctx->cq_overflow_list)) { + bool needs_lock = ctx->flags & IORING_SETUP_IOPOLL; + + io_ring_submit_lock(ctx, needs_lock); io_cqring_overflow_flush(ctx, false, NULL, NULL); - io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); + io_ring_submit_unlock(ctx, needs_lock); + } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) From 0020ef04e48571a88d4f482ad08f71052c5c5a08 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Fri, 18 Dec 2020 15:26:48 +0800 Subject: [PATCH 08/17] io_uring: fix io_wqe->work_list corruption For the first time a req punted to io-wq, we'll initialize io_wq_work's list to be NULL, then insert req to io_wqe->work_list. If this req is not inserted into tail of io_wqe->work_list, this req's io_wq_work list will point to another req's io_wq_work. For splitted bio case, this req maybe inserted to io_wqe->work_list repeatedly, once we insert it to tail of io_wqe->work_list for the second time, now io_wq_work->list->next will be invalid pointer, which then result in many strang error, panic, kernel soft-lockup, rcu stall, etc. In my vm, kernel doest not have commit cc29e1bf0d63f7 ("block: disable iopoll for split bio"), below fio job can reproduce this bug steadily: [global] name=iouring-sqpoll-iopoll-1 ioengine=io_uring iodepth=128 numjobs=1 thread rw=randread direct=1 registerfiles=1 hipri=1 bs=4m size=100M runtime=120 time_based group_reporting randrepeat=0 [device] directory=/home/feiman.wxg/mntpoint/ # an ext4 mount point If we have commit cc29e1bf0d63f7 ("block: disable iopoll for split bio"), there will no splitted bio case for polled io, but I think we still to need to fix this list corruption, it also should maybe go to stable branchs. To fix this corruption, if a req is inserted into tail of io_wqe->work_list, initialize req->io_wq_work->list->next to bu NULL. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io-wq.h b/fs/io-wq.h index 069496c6d4f9..75113bcd5889 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -59,6 +59,7 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, list->last->next = node; list->last = node; } + node->next = NULL; } static inline void wq_list_cut(struct io_wq_work_list *list, From dfea9fce29fda6f2f91161677e0e0d9b671bc099 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 18 Dec 2020 13:12:21 +0000 Subject: [PATCH 09/17] io_uring: close a small race gap for files cancel The purpose of io_uring_cancel_files() is to wait for all requests matching ->files to go/be cancelled. We should first drop files of a request in io_req_drop_files() and only then make it undiscoverable for io_uring_cancel_files. First drop, then delete from list. It's ok to leave req->id->files dangling, because it's not dereferenced by cancellation code, only compared against. It would potentially go to sleep and be awaken by following in io_req_drop_files() wake_up(). Fixes: 0f2122045b946 ("io_uring: don't rely on weak ->files references") Cc: # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cf6f22afc5e..b74957856e68 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6098,15 +6098,15 @@ static void io_req_drop_files(struct io_kiocb *req) struct io_uring_task *tctx = req->task->io_uring; unsigned long flags; - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; put_files_struct(req->work.identity->files); put_nsproxy(req->work.identity->nsproxy); + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; req->work.flags &= ~IO_WQ_WORK_FILES; + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); } static void __io_clean_op(struct io_kiocb *req) From dd20166236953c8cd14f4c668bf972af32f0c6be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Dec 2020 03:15:43 +0000 Subject: [PATCH 10/17] io_uring: fix 0-iov read buffer select Doing vectored buf-select read with 0 iovec passed is meaningless and utterly broken, forbid it. Cc: # 5.7+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b74957856e68..f3690dfdd564 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3125,9 +3125,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, iov[0].iov_len = kbuf->len; return 0; } - if (!req->rw.len) - return 0; - else if (req->rw.len > 1) + if (req->rw.len != 1) return -EINVAL; #ifdef CONFIG_COMPAT From 00c18640c2430c4bafaaeede1f9dd6f7ec0e4b25 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Dec 2020 10:45:02 -0700 Subject: [PATCH 11/17] io_uring: make ctx cancel on exit targeted to actual ctx Before IORING_SETUP_ATTACH_WQ, we could just cancel everything on the io-wq when exiting. But that's not the case if they are shared, so cancel for the specific ctx instead. Cc: stable@vger.kernel.org Fixes: 24369c2e3bb0 ("io_uring: add io-wq workqueue sharing") Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f3690dfdd564..48b9332c2354 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8637,6 +8637,13 @@ static void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } +static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -8651,7 +8658,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_poll_remove_all(ctx, NULL, NULL); if (ctx->io_wq) - io_wq_cancel_all(ctx->io_wq); + io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); From 446bc1c207331080d8c711a4456799b7d0b9df26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Dec 2020 10:47:42 -0700 Subject: [PATCH 12/17] io-wq: kill now unused io_wq_cancel_all() io_uring no longer issues full cancelations on the io-wq, so remove any remnants of this code and the IO_WQ_BIT_CANCEL flag. Signed-off-by: Jens Axboe --- fs/io-wq.c | 30 +----------------------------- fs/io-wq.h | 2 -- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index f72d53848dcb..a564f36e260c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -36,8 +36,7 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ - IO_WQ_BIT_CANCEL = 1, /* cancel work on list */ - IO_WQ_BIT_ERROR = 2, /* error on setup */ + IO_WQ_BIT_ERROR = 1, /* error on setup */ }; enum { @@ -561,12 +560,6 @@ get_next: next_hashed = wq_next_work(work); io_impersonate_work(worker, work); - /* - * OK to set IO_WQ_WORK_CANCEL even for uncancellable - * work, the worker function will do the right thing. - */ - if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) - work->flags |= IO_WQ_WORK_CANCEL; old_work = work; linked = wq->do_work(work); @@ -732,12 +725,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - /* * Iterate the passed in list and call the specific function for each * worker that isn't exiting @@ -938,21 +925,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -void io_wq_cancel_all(struct io_wq *wq) -{ - int node; - - set_bit(IO_WQ_BIT_CANCEL, &wq->state); - - rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL); - } - rcu_read_unlock(); -} - struct io_cb_cancel_data { work_cancel_fn *fn; void *data; diff --git a/fs/io-wq.h b/fs/io-wq.h index 75113bcd5889..b158f8addcf3 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -129,8 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) return work->flags & IO_WQ_WORK_HASHED; } -void io_wq_cancel_all(struct io_wq *wq); - typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, From 55583d72e2303638d30dd4a7aabef59ffa0a017a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Dec 2020 13:21:43 +0000 Subject: [PATCH 13/17] io_uring: always progress task_work on task cancel Might happen that __io_uring_cancel_task_requests() cancels nothing but there are task_works pending. We need to always run them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 48b9332c2354..d0ab6b503a9f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8801,9 +8801,9 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, ret |= io_poll_remove_all(ctx, task, NULL); ret |= io_kill_timeouts(ctx, task, NULL); + ret |= io_run_task_work(); if (!ret) break; - io_run_task_work(); cond_resched(); } } From f57555eda979ca085d2524db81e14b8a6089e15e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Dec 2020 13:21:44 +0000 Subject: [PATCH 14/17] io_uring: end waiting before task cancel attempts Get rid of TASK_UNINTERRUPTIBLE and waiting with finish_wait before going for next iteration in __io_uring_task_cancel(), because __io_uring_files_cancel() doesn't expect that sheduling is disallowed. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d0ab6b503a9f..fbf747803dbc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8989,9 +8989,9 @@ void __io_uring_task_cancel(void) if (inflight != tctx_inflight(tctx)) continue; schedule(); + finish_wait(&tctx->wait, &wait); } while (1); - finish_wait(&tctx->wait, &wait); atomic_dec(&tctx->in_idle); } From a528b04ea40690ff40501f50d618a62a02b19620 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Dec 2020 18:34:04 +0000 Subject: [PATCH 15/17] io_uring: fix ignoring xa_store errors xa_store() may fail, check the result. Cc: stable@vger.kernel.org # 5.10 Fixes: 0f2122045b946 ("io_uring: don't rely on weak ->files references") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fbf747803dbc..846c635d0620 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8852,10 +8852,9 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; + int ret; if (unlikely(!tctx)) { - int ret; - ret = io_uring_alloc_task_context(current); if (unlikely(ret)) return ret; @@ -8866,7 +8865,12 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) if (!old) { get_file(file); - xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL); + ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, + file, GFP_KERNEL)); + if (ret) { + fput(file); + return ret; + } } tctx->last = file; } From 9faadcc8abe4b83d0263216dc3a6321d5bbd616b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Dec 2020 18:34:05 +0000 Subject: [PATCH 16/17] io_uring: fix double io_uring free Once we created a file for current context during setup, we should not call io_ring_ctx_wait_and_kill() directly as it'll be done by fput(file) Cc: stable@vger.kernel.org # 5.10 Reported-by: syzbot+c9937dfb2303a5f18640@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov [axboe: fix unused 'ret' for !CONFIG_UNIX] Signed-off-by: Jens Axboe --- fs/io_uring.c | 73 ++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 846c635d0620..7c0b77d49cd2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9379,55 +9379,52 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return 0; } +static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +{ + int ret, fd; + + fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (fd < 0) + return fd; + + ret = io_uring_add_task_file(ctx, file); + if (ret) { + put_unused_fd(fd); + return ret; + } + fd_install(fd, file); + return fd; +} + /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, * we have to tie this fd to a socket for file garbage collection purposes. */ -static int io_uring_get_fd(struct io_ring_ctx *ctx) +static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { struct file *file; - int ret; - int fd; - #if defined(CONFIG_UNIX) + int ret; + ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, &ctx->ring_sock); if (ret) - return ret; + return ERR_PTR(ret); #endif - ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); - if (ret < 0) - goto err; - fd = ret; - file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); +#if defined(CONFIG_UNIX) if (IS_ERR(file)) { - put_unused_fd(fd); - ret = PTR_ERR(file); - goto err; + sock_release(ctx->ring_sock); + ctx->ring_sock = NULL; + } else { + ctx->ring_sock->file = file; } - -#if defined(CONFIG_UNIX) - ctx->ring_sock->file = file; #endif - ret = io_uring_add_task_file(ctx, file); - if (ret) { - fput(file); - put_unused_fd(fd); - goto err; - } - fd_install(fd, file); - return fd; -err: -#if defined(CONFIG_UNIX) - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; -#endif - return ret; + return file; } static int io_uring_create(unsigned entries, struct io_uring_params *p, @@ -9435,6 +9432,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, { struct user_struct *user = NULL; struct io_ring_ctx *ctx; + struct file *file; bool limit_mem; int ret; @@ -9582,13 +9580,22 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + file = io_uring_get_file(ctx); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err; + } + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_get_fd(ctx); - if (ret < 0) - goto err; + ret = io_uring_install_fd(ctx, file); + if (ret < 0) { + /* fput will clean it up */ + fput(file); + return ret; + } trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; From c07e6719511e77c4b289f62bfe96423eb6ea061d Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Mon, 14 Dec 2020 23:49:41 +0800 Subject: [PATCH 17/17] io_uring: hold uring_lock while completing failed polled io in io_wq_submit_work() io_iopoll_complete() does not hold completion_lock to complete polled io, so in io_wq_submit_work(), we can not call io_req_complete() directly, to complete polled io, otherwise there maybe concurrent access to cqring, defer_list, etc, which is not safe. Commit dad1b1242fd5 ("io_uring: always let io_iopoll_complete() complete polled io") has fixed this issue, but Pavel reported that IOPOLL apart from rw can do buf reg/unreg requests( IORING_OP_PROVIDE_BUFFERS or IORING_OP_REMOVE_BUFFERS), so the fix is not good. Given that io_iopoll_complete() is always called under uring_lock, so here for polled io, we can also get uring_lock to fix this issue. Fixes: dad1b1242fd5 ("io_uring: always let io_iopoll_complete() complete polled io") Cc: # 5.5+ Signed-off-by: Xiaoguang Wang Reviewed-by: Pavel Begunkov [axboe: don't deref 'req' after completing it'] Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7c0b77d49cd2..7e35283fc0b1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6332,19 +6332,28 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) } if (ret) { - /* - * io_iopoll_complete() does not hold completion_lock to complete - * polled io, so here for polled io, just mark it done and still let - * io_iopoll_complete() complete it. - */ - if (req->ctx->flags & IORING_SETUP_IOPOLL) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_ring_ctx *lock_ctx = NULL; - kiocb_done(kiocb, ret, NULL); - } else { - req_set_fail_links(req); - io_req_complete(req, ret); - } + if (req->ctx->flags & IORING_SETUP_IOPOLL) + lock_ctx = req->ctx; + + /* + * io_iopoll_complete() does not hold completion_lock to + * complete polled io, so here for polled io, we can not call + * io_req_complete() directly, otherwise there maybe concurrent + * access to cqring, defer_list, etc, which is not safe. Given + * that io_iopoll_complete() is always called under uring_lock, + * so here for polled io, we also get uring_lock to complete + * it. + */ + if (lock_ctx) + mutex_lock(&lock_ctx->uring_lock); + + req_set_fail_links(req); + io_req_complete(req, ret); + + if (lock_ctx) + mutex_unlock(&lock_ctx->uring_lock); } return io_steal_work(req);