From ce59fc69b1c2da555706f6b0e77fc099f80e9d0e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Sep 2020 13:28:09 -0600 Subject: [PATCH 01/43] io_uring: allow SQPOLL with CAP_SYS_NICE privileges CAP_SYS_ADMIN is too restrictive for a lot of use cases; allow CAP_SYS_NICE based on the premise that such users are already allowed to raise the priority of tasks. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a8c136a1cf4e..3cc1e59dd789 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7783,7 +7783,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_sq_data *sqd; ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) goto err; sqd = io_get_sq_data(p); From b713c195d59332277a31a59c91f755e53b5b302b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:13:35 -0600 Subject: [PATCH 02/43] net: provide __sys_shutdown_sock() that takes a socket No functional changes in this patch; it is needed to provide io_uring support for shutdown(2). Cc: netdev@vger.kernel.org Cc: David S. Miller Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/linux/socket.h | 1 + net/socket.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index e9cb30d8cbfb..385894b4a8bb 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -436,6 +436,7 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); +extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); extern struct ns_common *get_net_ns(struct ns_common *ns); diff --git a/net/socket.c b/net/socket.c index 6e6cccc2104f..4b615c719765 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2192,6 +2192,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, * Shutdown a socket. */ +int __sys_shutdown_sock(struct socket *sock, int how) +{ + int err; + + err = security_socket_shutdown(sock, how); + if (!err) + err = sock->ops->shutdown(sock, how); + + return err; +} + int __sys_shutdown(int fd, int how) { int err, fput_needed; @@ -2199,9 +2210,7 @@ int __sys_shutdown(int fd, int how) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { - err = security_socket_shutdown(sock, how); - if (!err) - err = sock->ops->shutdown(sock, how); + err = __sys_shutdown_sock(sock, how); fput_light(sock->file, fput_needed); } return err; From 36f4fa6886a81266d7c82b1c90a65205e73a7c85 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:14:22 -0600 Subject: [PATCH 03/43] io_uring: add support for shutdown(2) This adds support for the shutdown(2) system call, which is useful for dealing with sockets. shutdown(2) may block, so we have to punt it to async context.
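For illustration, a minimal liburing-based consumer could look like the sketch below (an assumption, not part of this patch: a liburing new enough to provide io_uring_prep_shutdown(); error handling trimmed):

        #include <liburing.h>
        #include <sys/socket.h>

        /* Shut down the write side of a connected socket through the ring;
         * cqe->res carries 0 or -errno, mirroring shutdown(2). */
        static int ring_shutdown_wr(int sockfd)
        {
                struct io_uring ring;
                struct io_uring_sqe *sqe;
                struct io_uring_cqe *cqe;
                int ret;

                ret = io_uring_queue_init(4, &ring, 0);
                if (ret < 0)
                        return ret;

                sqe = io_uring_get_sqe(&ring);
                io_uring_prep_shutdown(sqe, sockfd, SHUT_WR);
                io_uring_submit(&ring);

                ret = io_uring_wait_cqe(&ring, &cqe);
                if (!ret) {
                        ret = cqe->res;
                        io_uring_cqe_seen(&ring, cqe);
                }
                io_uring_queue_exit(&ring);
                return ret;
        }
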
Suggested-by: Norman Maurer Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 53 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3cc1e59dd789..d17198733f6a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,11 @@ struct io_statx { struct statx __user *buffer; }; +struct io_shutdown { + struct file *file; + int how; +}; + struct io_completion { struct file *file; struct list_head list; @@ -667,6 +672,7 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + struct io_shutdown shutdown; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -934,6 +940,9 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + }, }; enum io_mem_account { @@ -3591,6 +3600,44 @@ out_free: return ret; } +static int io_shutdown_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_NET) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index) + return -EINVAL; + + req->shutdown.how = READ_ONCE(sqe->len); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_shutdown(struct io_kiocb *req, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (force_nonblock) + return -EAGAIN; + + sock = sock_from_file(req->file, &ret); + if (unlikely(!sock)) + return ret; + + ret = __sys_shutdown_sock(sock, req->shutdown.how); + io_req_complete(req, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5775,6 +5822,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_remove_buffers_prep(req, sqe); case IORING_OP_TEE: return io_tee_prep(req, sqe); + case IORING_OP_SHUTDOWN: + return io_shutdown_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6018,6 +6067,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_TEE: ret = io_tee(req, force_nonblock); break; + case IORING_OP_SHUTDOWN: + ret = io_shutdown(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 98d8e06dea22..e943bf07c959 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -132,6 +132,7 @@ enum { IORING_OP_PROVIDE_BUFFERS, IORING_OP_REMOVE_BUFFERS, IORING_OP_TEE, + IORING_OP_SHUTDOWN, /* this goes last, obviously */ IORING_OP_LAST, From 28cea78af44918b920306df150afbd116bd94301 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Sep 2020 10:51:17 -0600 Subject: [PATCH 04/43] io_uring: allow non-fixed files with SQPOLL The restriction of needing fixed files for SQPOLL is problematic, and prevents/inhibits several valid use cases. With the referenced files_struct that we have now, it's trivially supportable. Treat ->files like we do the mm for the SQPOLL thread - grab a reference to it (and assign it), and drop it when we're done. This feature is exposed as IORING_FEAT_SQPOLL_NONFIXED.
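For reference, a userspace sketch of probing the new feature bit before depending on plain (non-registered) fds with SQPOLL (liburing-based and illustrative only; the function name and ring parameters are this example's, not the patch's):

        #include <liburing.h>

        /* Returns 1 if SQPOLL works with non-fixed files, 0 if not, or
         * -errno on setup failure (e.g. lacking the privileges from the
         * earlier CAP_SYS_NICE patch). */
        static int sqpoll_nonfixed_supported(void)
        {
                struct io_uring_params p = { };
                struct io_uring ring;
                int ret;

                p.flags = IORING_SETUP_SQPOLL;
                p.sq_thread_idle = 1000;        /* ms before the SQ thread idles */

                ret = io_uring_queue_init_params(8, &ring, &p);
                if (ret < 0)
                        return ret;

                ret = !!(p.features & IORING_FEAT_SQPOLL_NONFIXED);
                io_uring_queue_exit(&ring);
                return ret;
        }
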
Signed-off-by: Jens Axboe --- fs/io_uring.c | 87 +++++++++++++++++++++++++++-------- include/uapi/linux/io_uring.h | 1 + 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d17198733f6a..c1f3980945e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -999,8 +999,9 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } -static void io_sq_thread_drop_mm(void) +static void io_sq_thread_drop_mm_files(void) { + struct files_struct *files = current->files; struct mm_struct *mm = current->mm; if (mm) { @@ -1008,6 +1009,40 @@ static void io_sq_thread_drop_mm(void) mmput(mm); current->mm = NULL; } + if (files) { + struct nsproxy *nsproxy = current->nsproxy; + + task_lock(current); + current->files = NULL; + current->nsproxy = NULL; + task_unlock(current); + put_files_struct(files); + put_nsproxy(nsproxy); + } +} + +static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) +{ + if (!current->files) { + struct files_struct *files; + struct nsproxy *nsproxy; + + task_lock(ctx->sqo_task); + files = ctx->sqo_task->files; + if (!files) { + task_unlock(ctx->sqo_task); + return; + } + atomic_inc(&files->count); + get_nsproxy(ctx->sqo_task->nsproxy); + nsproxy = ctx->sqo_task->nsproxy; + task_unlock(ctx->sqo_task); + + task_lock(current); + current->files = files; + current->nsproxy = nsproxy; + task_unlock(current); + } } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) @@ -1035,12 +1070,21 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) return -EFAULT; } -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, + struct io_kiocb *req) { - if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) - return 0; - return __io_sq_thread_acquire_mm(ctx); + const struct io_op_def *def = &io_op_defs[req->opcode]; + + if (def->work_flags & IO_WQ_WORK_MM) { + int ret = __io_sq_thread_acquire_mm(ctx); + if (unlikely(ret)) + return ret; + } + + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) + __io_sq_thread_acquire_files(ctx); + + return 0; } static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, @@ -2061,6 +2105,7 @@ static void __io_req_task_submit(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; if (!__io_sq_thread_acquire_mm(ctx)) { + __io_sq_thread_acquire_files(ctx); mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); @@ -2603,7 +2648,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - ret = io_sq_thread_acquire_mm(req->ctx, req); + ret = io_sq_thread_acquire_mm_files(req->ctx, req); if (io_resubmit_prep(req, ret)) { refcount_inc(&req->refs); @@ -6168,13 +6213,7 @@ static struct file *io_file_get(struct io_submit_state *state, static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, int fd) { - bool fixed; - - fixed = (req->flags & REQ_F_FIXED_FILE) != 0; - if (unlikely(!fixed && io_async_submit(req->ctx))) - return -EBADF; - - req->file = io_file_get(state, req, fd, fixed); + req->file = io_file_get(state, req, fd, req->flags & REQ_F_FIXED_FILE); if (req->file || io_op_defs[req->opcode].needs_file_no_error) return 0; return -EBADF; @@ -6551,7 +6590,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if 
(unlikely(io_sq_thread_acquire_mm(ctx, req))) + if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); @@ -6739,7 +6778,7 @@ again: * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(); + io_sq_thread_drop_mm_files(); /* * We're polling. If we're within the defined idle @@ -6808,11 +6847,18 @@ static void io_sqd_init_new(struct io_sq_data *sqd) static int io_sq_thread(void *data) { struct cgroup_subsys_state *cur_css = NULL; + struct files_struct *old_files = current->files; + struct nsproxy *old_nsproxy = current->nsproxy; const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; unsigned long start_jiffies; + task_lock(current); + current->files = NULL; + current->nsproxy = NULL; + task_unlock(current); + start_jiffies = jiffies; while (!kthread_should_stop()) { enum sq_ret ret = 0; @@ -6845,7 +6891,7 @@ static int io_sq_thread(void *data) ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); - io_sq_thread_drop_mm(); + io_sq_thread_drop_mm_files(); } if (ret & SQT_SPIN) { @@ -6870,6 +6916,11 @@ static int io_sq_thread(void *data) if (old_cred) revert_creds(old_cred); + task_lock(current); + current->files = old_files; + current->nsproxy = old_nsproxy; + task_unlock(current); + kthread_parkme(); return 0; @@ -9415,7 +9466,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS; + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e943bf07c959..2301c37e86cb 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -254,6 +254,7 @@ struct io_uring_params { #define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_FAST_POLL (1U << 5) #define IORING_FEAT_POLL_32BITS (1U << 6) +#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) /* * io_uring_register(2) opcodes and arguments From 14587a46646d30d2b4a6b69865682cfe6bbdcd1f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:36:08 -0600 Subject: [PATCH 05/43] io_uring: enable file table usage for SQPOLL rings Now that SQPOLL supports non-registered files and grabs the file table, we can relax the restriction on open/close/accept/connect and allow them on a ring that is setup with IORING_SETUP_SQPOLL. 
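As an illustration of what this relaxation enables, here is a sketch of queueing an accept(2) on a plain, non-registered fd from an SQPOLL ring (assumes liburing and a ring set up with IORING_SETUP_SQPOLL; not part of this patch):

        #include <errno.h>
        #include <liburing.h>

        static int queue_accept(struct io_uring *ring, int listen_fd)
        {
                struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

                if (!sqe)
                        return -EBUSY;
                /* Before this patch, accept on an SQPOLL ring was
                 * rejected with -EINVAL at prep time. */
                io_uring_prep_accept(sqe, listen_fd, NULL, NULL, 0);
                return io_uring_submit(ring); /* may just wake the SQ thread */
        }
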
Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c1f3980945e4..1824bd4329ee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -869,7 +869,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + IO_WQ_WORK_FS | IO_WQ_WORK_MM, }, [IORING_OP_CLOSE] = { .needs_file = 1, @@ -921,7 +921,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT2] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | - IO_WQ_WORK_BLKCG, + IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -3897,7 +3897,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { u64 flags, mode; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; mode = READ_ONCE(sqe->len); flags = READ_ONCE(sqe->open_flags); @@ -3911,7 +3911,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); @@ -4305,7 +4305,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io_req_init_async(req); req->work.flags |= IO_WQ_WORK_NO_CANCEL; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) @@ -4786,7 +4786,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; @@ -4827,7 +4827,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_connect *conn = &req->connect; struct io_async_connect *io = req->async_data; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; From e886663cfd029b64a1d8da7efae7014526d884e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Sep 2020 17:20:17 -0600 Subject: [PATCH 06/43] fs: make do_renameat2() take struct filename Pass in the struct filename pointers instead of the user string, and update the three callers to do the same. This behaves like do_unlinkat(), which also takes a filename struct and puts it when it is done. Converting callers is then trivial. 
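The resulting in-kernel calling convention, as a sketch (it mirrors the converted syscall wrappers in the diff below; note that do_renameat2() now puts both filenames itself, even on error):

        /* The caller resolves the user strings up front; ownership of
         * both struct filename pointers passes to do_renameat2(),
         * including ERR_PTR values, which it checks and puts. */
        return do_renameat2(olddfd, getname(oldname),
                            newdfd, getname(newname), flags);
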
Signed-off-by: Jens Axboe --- fs/internal.h | 2 ++ fs/namei.c | 40 ++++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index a7cd0f64faa4..6fd14ea213c3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -78,6 +78,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, long do_rmdir(int dfd, struct filename *name); long do_unlinkat(int dfd, struct filename *name); int may_linkat(struct path *link); +int do_renameat2(int olddfd, struct filename *oldname, int newdfd, + struct filename *newname, unsigned int flags); /* * namespace.c diff --git a/fs/namei.c b/fs/namei.c index d4a6dd772303..03d0e11e4f36 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4346,8 +4346,8 @@ out: } EXPORT_SYMBOL(vfs_rename); -static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, - const char __user *newname, unsigned int flags) +int do_renameat2(int olddfd, struct filename *from, int newdfd, + struct filename *to, unsigned int flags) { struct dentry *old_dentry, *new_dentry; struct dentry *trap; @@ -4355,32 +4355,30 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - struct filename *from; - struct filename *to; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; - int error; + int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) - return -EINVAL; + goto put_both; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) - return -EINVAL; + goto put_both; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: - from = filename_parentat(olddfd, getname(oldname), lookup_flags, - &old_path, &old_last, &old_type); + from = filename_parentat(olddfd, from, lookup_flags, &old_path, + &old_last, &old_type); if (IS_ERR(from)) { error = PTR_ERR(from); - goto exit; + goto put_new; } - to = filename_parentat(newdfd, getname(newname), lookup_flags, - &new_path, &new_last, &new_type); + to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, + &new_type); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; @@ -4473,34 +4471,40 @@ exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); - putname(to); exit1: path_put(&old_path); - putname(from); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } -exit: +put_both: + if (!IS_ERR(from)) + putname(from); +put_new: + if (!IS_ERR(to)) + putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { - return do_renameat2(olddfd, oldname, newdfd, newname, flags); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { - return do_renameat2(olddfd, oldname, newdfd, newname, 0); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, + getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link) From 80a261fd00327898e272ddc84ccc9510c036453c Mon Sep 17 00:00:00 2001 From: Jens 
Axboe Date: Mon, 28 Sep 2020 14:23:58 -0600 Subject: [PATCH 07/43] io_uring: add support for IORING_OP_RENAMEAT IORING_OP_RENAMEAT behaves like renameat2(), and takes the same flags etc. Signed-off-by: Jens Axboe --- fs/io_uring.c | 70 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 + 2 files changed, 72 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1824bd4329ee..94a5e1618368 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -546,6 +546,15 @@ struct io_shutdown { int how; }; +struct io_rename { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + struct io_completion { struct file *file; struct list_head list; @@ -673,6 +682,7 @@ struct io_kiocb { struct io_provide_buf pbuf; struct io_statx statx; struct io_shutdown shutdown; + struct io_rename rename; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -943,6 +953,10 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SHUTDOWN] = { .needs_file = 1, }, + [IORING_OP_RENAMEAT] = { + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, }; enum io_mem_account { @@ -3645,6 +3659,53 @@ out_free: return ret; } +static int io_renameat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_rename *ren = &req->rename; + const char __user *oldf, *newf; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ren->old_dfd = READ_ONCE(sqe->fd); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ren->new_dfd = READ_ONCE(sqe->len); + ren->flags = READ_ONCE(sqe->rename_flags); + + ren->oldpath = getname(oldf); + if (IS_ERR(ren->oldpath)) + return PTR_ERR(ren->oldpath); + + ren->newpath = getname(newf); + if (IS_ERR(ren->newpath)) { + putname(ren->oldpath); + return PTR_ERR(ren->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_renameat(struct io_kiocb *req, bool force_nonblock) +{ + struct io_rename *ren = &req->rename; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, + ren->newpath, ren->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5869,6 +5930,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_tee_prep(req, sqe); case IORING_OP_SHUTDOWN: return io_shutdown_prep(req, sqe); + case IORING_OP_RENAMEAT: + return io_renameat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6006,6 +6069,10 @@ static void __io_clean_op(struct io_kiocb *req) if (req->open.filename) putname(req->open.filename); break; + case IORING_OP_RENAMEAT: + putname(req->rename.oldpath); + putname(req->rename.newpath); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } @@ -6115,6 +6182,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_SHUTDOWN: ret = io_shutdown(req, force_nonblock); break; + case IORING_OP_RENAMEAT: + ret = io_renameat(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2301c37e86cb..c9a58bc7e4be 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -42,6 +42,7 @@ struct io_uring_sqe { __u32 
statx_flags; __u32 fadvise_advice; __u32 splice_flags; + __u32 rename_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -133,6 +134,7 @@ enum { IORING_OP_REMOVE_BUFFERS, IORING_OP_TEE, IORING_OP_SHUTDOWN, + IORING_OP_RENAMEAT, /* this goes last, obviously */ IORING_OP_LAST, From 14a1143b68ee2e4ec4e8d54f71cddb9724f9ec70 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Sep 2020 14:27:37 -0600 Subject: [PATCH 08/43] io_uring: add support for IORING_OP_UNLINKAT IORING_OP_UNLINKAT behaves like unlinkat(2) and takes the same flags and arguments. Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 66 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 94a5e1618368..c8ecbc0bd286 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -555,6 +555,13 @@ struct io_rename { int flags; }; +struct io_unlink { + struct file *file; + int dfd; + int flags; + struct filename *filename; +}; + struct io_completion { struct file *file; struct list_head list; @@ -683,6 +690,7 @@ struct io_kiocb { struct io_statx statx; struct io_shutdown shutdown; struct io_rename rename; + struct io_unlink unlink; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -957,6 +965,10 @@ static const struct io_op_def io_op_defs[] = { .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, + [IORING_OP_UNLINKAT] = { + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, }; enum io_mem_account { @@ -3706,6 +3718,50 @@ static int io_renameat(struct io_kiocb *req, bool force_nonblock) return 0; } +static int io_unlinkat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_unlink *un = &req->unlink; + const char __user *fname; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + un->dfd = READ_ONCE(sqe->fd); + + un->flags = READ_ONCE(sqe->unlink_flags); + if (un->flags & ~AT_REMOVEDIR) + return -EINVAL; + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + un->filename = getname(fname); + if (IS_ERR(un->filename)) + return PTR_ERR(un->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) +{ + struct io_unlink *un = &req->unlink; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (un->flags & AT_REMOVEDIR) + ret = do_rmdir(un->dfd, un->filename); + else + ret = do_unlinkat(un->dfd, un->filename); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5932,6 +5988,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_shutdown_prep(req, sqe); case IORING_OP_RENAMEAT: return io_renameat_prep(req, sqe); + case IORING_OP_UNLINKAT: + return io_unlinkat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6073,6 +6131,9 @@ static void __io_clean_op(struct io_kiocb *req) putname(req->rename.oldpath); putname(req->rename.newpath); break; + case IORING_OP_UNLINKAT: + putname(req->unlink.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } @@ -6185,6 +6246,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_RENAMEAT: ret = io_renameat(req, force_nonblock); break; + case IORING_OP_UNLINKAT: + ret = io_unlinkat(req, 
force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index c9a58bc7e4be..557e7eae497f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -43,6 +43,7 @@ struct io_uring_sqe { __u32 fadvise_advice; __u32 splice_flags; __u32 rename_flags; + __u32 unlink_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -135,6 +136,7 @@ enum { IORING_OP_TEE, IORING_OP_SHUTDOWN, IORING_OP_RENAMEAT, + IORING_OP_UNLINKAT, /* this goes last, obviously */ IORING_OP_LAST, From 018043be1f1bc43ad6956bfd39b7beea12fb4ca6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:17:18 +0000 Subject: [PATCH 09/43] io_uring: split poll and poll_remove structs Don't use a single struct for polls and poll remove requests, they have totally different layouts. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c8ecbc0bd286..10cfb6d17994 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -395,16 +395,18 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - union { - struct wait_queue_head *head; - u64 addr; - }; + struct wait_queue_head *head; __poll_t events; bool done; bool canceled; struct wait_queue_entry wait; }; +struct io_poll_remove { + struct file *file; + u64 addr; +}; + struct io_close { struct file *file; struct file *put_file; @@ -672,6 +674,7 @@ struct io_kiocb { struct file *file; struct io_rw rw; struct io_poll_iocb poll; + struct io_poll_remove poll_remove; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -5538,7 +5541,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, sqe->poll_events) return -EINVAL; - req->poll.addr = READ_ONCE(sqe->addr); + req->poll_remove.addr = READ_ONCE(sqe->addr); return 0; } @@ -5549,12 +5552,10 @@ static int io_poll_remove_prep(struct io_kiocb *req, static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - u64 addr; int ret; - addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, addr); + ret = io_poll_cancel(ctx, req->poll_remove.addr); spin_unlock_irq(&ctx->completion_lock); if (ret < 0) From 863e05604a6fb45f0f56b3e9eca5cd533001253b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:35 +0000 Subject: [PATCH 10/43] io_uring: track link's head and tail during submit Explicitly save not only a link's head in io_submit_sqe[s]() but the tail as well. That's in preparation for keeping linked requests in a singly linked list. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 10cfb6d17994..0441185ac510 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6536,8 +6536,13 @@ static inline void io_queue_link_head(struct io_kiocb *req, io_queue_sqe(req, NULL, cs); } +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link, struct io_comp_state *cs) + struct io_submit_link *link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -6549,8 +6554,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ - if (*link) { - struct io_kiocb *head = *link; + if (link->head) { + struct io_kiocb *head = link->head; /* * Taking sequential execution of a link, draining both sides @@ -6571,11 +6576,12 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } trace_io_uring_link(ctx, req, head); list_add_tail(&req->link_list, &head->link_list); + link->last = req; /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { io_queue_link_head(head, cs); - *link = NULL; + link->head = NULL; } } else { if (unlikely(ctx->drain_next)) { @@ -6589,7 +6595,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; - *link = req; + link->head = req; + link->last = req; } else { io_queue_sqe(req, sqe, cs); } @@ -6769,7 +6776,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; - struct io_kiocb *link = NULL; + struct io_submit_link link; int i, submitted = 0; /* if we have a backlog and couldn't flush it all, return BUSY */ @@ -6789,6 +6796,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) refcount_add(nr, &current->usage); io_submit_state_start(&state, ctx, nr); + link.head = NULL; for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; @@ -6834,8 +6842,8 @@ fail_req: percpu_counter_sub(&tctx->inflight, unused); put_task_struct_many(current, unused); } - if (link) - io_queue_link_head(link, &state.comp); + if (link.head) + io_queue_link_head(link.head, &state.comp); io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ From 90cd7e424969d29aff653333b4dcb4e2e199d791 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:36 +0000 Subject: [PATCH 11/43] io_uring: track link timeout's master explicitly In preparation for converting to singly linked lists for chaining requests, make linked timeouts save the requests that they're responsible for, rather than counting on the doubly linked list for back referencing.
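For context, the userspace pattern that exercises this path is a request chained to IORING_OP_LINK_TIMEOUT; a liburing-flavoured sketch (illustrative only, error handling trimmed, not part of this patch):

        #include <liburing.h>

        static void queue_read_with_timeout(struct io_uring *ring, int fd,
                                            void *buf, unsigned len)
        {
                struct __kernel_timespec ts = { .tv_sec = 1 };
                struct io_uring_sqe *sqe;

                sqe = io_uring_get_sqe(ring);
                io_uring_prep_read(sqe, fd, buf, len, 0);
                sqe->flags |= IOSQE_IO_LINK;    /* next SQE links to this one */

                /* The linked timeout cancels the read if it hasn't
                 * completed within 1s; this is the back reference the
                 * patch makes explicit. */
                sqe = io_uring_get_sqe(ring);
                io_uring_prep_link_timeout(sqe, &ts, 0);

                io_uring_submit(ring);
        }
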
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0441185ac510..2d14a4f636d6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -446,6 +446,8 @@ struct io_timeout { u32 off; u32 target_seq; struct list_head list; + /* head of the link, used by linked timeouts only */ + struct io_kiocb *head; }; struct io_timeout_rem { @@ -1984,6 +1986,7 @@ static void io_kill_linked_timeout(struct io_kiocb *req) int ret; list_del_init(&link->link_list); + link->timeout.head = NULL; ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); @@ -6358,26 +6361,22 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); - struct io_kiocb *req = data->req; + struct io_kiocb *prev, *req = data->req; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *prev = NULL; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); + prev = req->timeout.head; + req->timeout.head = NULL; /* * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->link_list)) { - prev = list_entry(req->link_list.prev, struct io_kiocb, - link_list); - if (refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->link_list); - else - prev = NULL; - } - + if (prev && refcount_inc_not_zero(&prev->refs)) + list_del_init(&req->link_list); + else + prev = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { @@ -6396,7 +6395,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) * If the list is now empty, then our linked request finished before * we got a chance to setup the timer */ - if (!list_empty(&req->link_list)) { + if (req->timeout.head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; @@ -6431,6 +6430,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; From f2f87370bb6664e5babb6705e886cfb340f163e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:37 +0000 Subject: [PATCH 12/43] io_uring: link requests with singly linked list A singly linked list is enough for keeping linked requests, because we almost always operate on the head and traverse forward, with the exception of linked timeouts going 1 hop backwards. Replace ->link_list with a handmade singly linked list. Also kill REQ_F_LINK_HEAD in favour of checking the newly added ->link for NULL directly. That saves 8B in io_kiocb, is not as heavy as list fixup, makes better use of cache by not touching a previous request (i.e. the last request of the link) on each list modification, optimises cache use further in the following patch, and actually makes traversal easier, removing some lines in the end. Also, keeping the invariant in ->link instead of having REQ_F_LINK_HEAD is less error-prone.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 133 ++++++++++++++++++++------------------------------ 1 file changed, 52 insertions(+), 81 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d14a4f636d6..a0a6d13c0c16 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -600,7 +600,6 @@ enum { REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_LINK_HEAD_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, @@ -632,8 +631,6 @@ enum { /* IOSQE_BUFFER_SELECT */ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* head of a link */ - REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -715,7 +712,7 @@ struct io_kiocb { struct task_struct *task; u64 user_data; - struct list_head link_list; + struct io_kiocb *link; /* * 1. used with ctx->iopoll_list with reads/writes @@ -1023,6 +1020,9 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) + static inline void io_clean_op(struct io_kiocb *req) { if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | @@ -1501,10 +1501,8 @@ static void io_prep_async_link(struct io_kiocb *req) { struct io_kiocb *cur; - io_prep_async_work(req); - if (req->flags & REQ_F_LINK_HEAD) - list_for_each_entry(cur, &req->link_list, link_list) - io_prep_async_work(cur); + io_for_each_link(cur, req) + io_prep_async_work(cur); } static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) @@ -1687,20 +1685,15 @@ static inline bool __io_match_files(struct io_kiocb *req, req->work.identity->files == files; } -static bool io_match_files(struct io_kiocb *req, - struct files_struct *files) +static bool io_match_files(struct io_kiocb *head, struct files_struct *files) { - struct io_kiocb *link; + struct io_kiocb *req; if (!files) return true; - if (__io_match_files(req, files)) - return true; - if (req->flags & REQ_F_LINK_HEAD) { - list_for_each_entry(link, &req->link_list, link_list) { - if (__io_match_files(link, files)) - return true; - } + io_for_each_link(req, head) { + if (__io_match_files(req, files)) + return true; } return false; } @@ -1967,6 +1960,14 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } +static inline void io_remove_next_linked(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + req->link = nxt->link; + nxt->link = NULL; +} + static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1975,8 +1976,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req) unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - link = list_first_entry_or_null(&req->link_list, struct io_kiocb, - link_list); + link = req->link; + /* * Can happen if a linked timeout fired and link had been like * req -> link t-out -> link t-out [-> ...] @@ -1985,7 +1986,7 @@ static void io_kill_linked_timeout(struct io_kiocb *req) struct io_timeout_data *io = link->async_data; int ret; - list_del_init(&link->link_list); + io_remove_next_linked(req); link->timeout.head = NULL; ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { @@ -2003,41 +2004,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req) } } -static struct io_kiocb *io_req_link_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt; - /* - * The list should never be empty when we are called here. 
But could - * potentially happen if the chain is messed up, check to be on the - * safe side. - */ - if (unlikely(list_empty(&req->link_list))) - return NULL; - - nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - return nxt; -} - -/* - * Called if REQ_F_LINK_HEAD is set, and we fail the head request - */ static void io_fail_links(struct io_kiocb *req) { + struct io_kiocb *link, *nxt; struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - while (!list_empty(&req->link_list)) { - struct io_kiocb *link = list_first_entry(&req->link_list, - struct io_kiocb, link_list); + link = req->link; + req->link = NULL; + + while (link) { + nxt = link->link; + link->link = NULL; - list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link, -ECANCELED); /* @@ -2049,8 +2031,8 @@ static void io_fail_links(struct io_kiocb *req) io_put_req_deferred(link, 2); else io_double_put_req(link); + link = nxt; } - io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2059,7 +2041,6 @@ static void io_fail_links(struct io_kiocb *req) static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - req->flags &= ~REQ_F_LINK_HEAD; if (req->flags & REQ_F_LINK_TIMEOUT) io_kill_linked_timeout(req); @@ -2069,15 +2050,19 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (likely(!(req->flags & REQ_F_FAIL_LINK))) - return io_req_link_next(req); + if (likely(!(req->flags & REQ_F_FAIL_LINK))) { + struct io_kiocb *nxt = req->link; + + req->link = NULL; + return nxt; + } io_fail_links(req); return NULL; } -static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) + if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) return NULL; return __io_req_find_next(req); } @@ -2173,7 +2158,7 @@ static void io_req_task_queue(struct io_kiocb *req) } } -static void io_queue_next(struct io_kiocb *req) +static inline void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2230,8 +2215,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) io_free_req(req); return; } - if (req->flags & REQ_F_LINK_HEAD) - io_queue_next(req); + io_queue_next(req); if (req->task != rb->task) { if (rb->task) { @@ -6015,11 +5999,10 @@ static u32 io_get_sequence(struct io_kiocb *req) { struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; - u32 total_submitted, nr_reqs = 1; + u32 total_submitted, nr_reqs = 0; - if (req->flags & REQ_F_LINK_HEAD) - list_for_each_entry(pos, &req->link_list, link_list) - nr_reqs++; + io_for_each_link(pos, req) + nr_reqs++; total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; return total_submitted - nr_reqs; @@ -6374,7 +6357,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * race with the completion of the linked work. 
*/ if (prev && refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->link_list); + io_remove_next_linked(prev); else prev = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -6392,8 +6375,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void __io_queue_linked_timeout(struct io_kiocb *req) { /* - * If the list is now empty, then our linked request finished before - * we got a chance to setup the timer + * If the back reference is NULL, then our linked request finished + * before we got a chance to setup the timer */ if (req->timeout.head) { struct io_timeout_data *data = req->async_data; @@ -6418,16 +6401,10 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt; + struct io_kiocb *nxt = req->link; - if (!(req->flags & REQ_F_LINK_HEAD)) - return NULL; - if (req->flags & REQ_F_LINK_TIMEOUT) - return NULL; - - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, - link_list); - if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || + nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; nxt->timeout.head = req; @@ -6575,7 +6552,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } trace_io_uring_link(ctx, req, head); - list_add_tail(&req->link_list, &head->link_list); + link->last->link = req; link->last = req; /* last request of a link, enqueue the link */ @@ -6589,9 +6566,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ctx->drain_next = 0; } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - req->flags |= REQ_F_LINK_HEAD; - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; @@ -6724,6 +6698,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->ctx = ctx; req->flags = 0; + req->link = NULL; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->task = current; @@ -8682,14 +8657,10 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) { struct io_kiocb *link; - if (!(preq->flags & REQ_F_LINK_HEAD)) - return false; - - list_for_each_entry(link, &preq->link_list, link_list) { + io_for_each_link(link, preq->link) { if (link == req) return true; } - return false; } From 0415767e7f0542b3cd1ab270c2e61e90e87aafa2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:38 +0000 Subject: [PATCH 13/43] io_uring: rearrange io_kiocb fields for better caching We've got extra 8 bytes in the 2nd cacheline, put ->fixed_file_refs there, so inline execution path mostly doesn't touch the 3rd cacheline for fixed_file requests as well. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a0a6d13c0c16..b651d6e6d609 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -713,14 +713,13 @@ struct io_kiocb { u64 user_data; struct io_kiocb *link; + struct percpu_ref *fixed_file_refs; /* * 1. used with ctx->iopoll_list with reads/writes * 2. to track reqs with ->files (see io_op_def::file_table) */ struct list_head inflight_entry; - - struct percpu_ref *fixed_file_refs; struct callback_head task_work; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; From 27926b683db03be307c6905b44ecfc1f081d9d6f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Oct 2020 09:33:23 -0600 Subject: [PATCH 14/43] io_uring: only plug when appropriate We unconditionally call blk_start_plug() when starting the IO submission, but we only really should do that if we have more than 1 request to submit AND we're potentially dealing with block based storage underneath. For any other type of request, it's just a waste of time to do so. Add a ->plug bit to io_op_def and set it for read/write requests. We could make this more precise and check the file itself as well, but it doesn't matter that much and would quickly become more expensive. Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b651d6e6d609..11ce97d6259c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -750,6 +750,8 @@ struct io_submit_state { void *reqs[IO_IOPOLL_BATCH]; unsigned int free_reqs; + bool plug_started; + /* * Batch completion logic */ @@ -782,6 +784,8 @@ struct io_op_def { unsigned buffer_select : 1; /* must always have async data allocated */ unsigned needs_async_data : 1; + /* should block plug */ + unsigned plug : 1; /* size of async data needed, if any */ unsigned short async_size; unsigned work_flags; @@ -795,6 +799,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .needs_async_data = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, @@ -804,6 +809,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_async_data = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, @@ -816,6 +822,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, @@ -824,6 +831,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | IO_WQ_WORK_MM, @@ -907,6 +915,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, @@ -914,6 +923,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, @@ -6585,7 +6595,8 @@ static void io_submit_state_end(struct io_submit_state *state) { if (!list_empty(&state->comp.list)) io_submit_flush_completions(&state->comp); - blk_finish_plug(&state->plug); + if (state->plug_started) + blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); @@ -6597,7 +6608,7 @@ static void io_submit_state_end(struct io_submit_state *state) static void io_submit_state_start(struct io_submit_state *state, struct io_ring_ctx *ctx, unsigned int max_ios) { - blk_start_plug(&state->plug); + state->plug_started = false; state->comp.nr = 0; INIT_LIST_HEAD(&state->comp.list); 
state->comp.ctx = ctx; @@ -6739,6 +6750,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags; + /* + * Plug now if we have more than 1 IO left after this, and the target + * is potentially a read/write to block based storage. + */ + if (!state->plug_started && state->ios_left > 1 && + io_op_defs[req->opcode].plug) { + blk_start_plug(&state->plug); + state->plug_started = true; + } + if (!io_op_defs[req->opcode].needs_file) return 0; From c73ebb685fb6dfb513d394cbea64fb81ba3d994f Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 3 Nov 2020 10:54:37 +0800 Subject: [PATCH 15/43] io_uring: add timeout support for io_uring_enter() Currently, users who want to be woken after a given time while waiting for events have to submit a timeout command first. That is not safe for applications that split SQ and CQ handling between two threads, such as mysql: the two threads must be synchronized explicitly to protect the SQ, which hurts performance. This patch adds timeout support to the existing io_uring_enter(). To avoid overloading arguments, it introduces a new parameter structure that contains the sigmask and the timeout. I have tested workloads with one thread submitting nop requests while the other reaps the CQEs with a timeout. It shows a 1.8~2x speedup when the iodepth is 16. Signed-off-by: Jiufei Xue Signed-off-by: Hao Xu [axboe: various cleanups/fixes, and name change to SIG_IS_DATA] Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++++++---- include/linux/syscalls.h | 2 +- include/uapi/linux/io_uring.h | 9 +++++ 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 11ce97d6259c..ee25c70527aa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7118,7 +7118,8 @@ static int io_run_task_work_sig(void) * application must reap them itself, as they reside on the shared cq ring. */ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz) + const sigset_t __user *sig, size_t sigsz, + struct __kernel_timespec __user *uts) { struct io_wait_queue iowq = { .wq = { @@ -7130,6 +7131,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; + struct timespec64 ts; + signed long timeout = 0; int ret = 0; do { @@ -7152,6 +7155,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } + if (uts) { + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = timespec64_to_jiffies(&ts); + } + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { @@ -7165,7 +7174,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; if (io_should_wake(&iowq, false)) break; - schedule(); + if (uts) { + timeout = schedule_timeout(timeout); + if (timeout == 0) { + ret = -ETIME; + break; + } + } else { + schedule(); + } } while (1); finish_wait(&ctx->wait, &iowq.wq); @@ -9167,9 +9184,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) finish_wait(&ctx->sqo_sq_wait, &wait); } +static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, + struct __kernel_timespec __user **ts, + const sigset_t __user **sig) +{ + struct io_uring_getevents_arg arg; + + /* + * If EXT_ARG isn't set, then we have no timespec and the argp pointer + * is just a pointer to the sigset_t.
+ */ + if (!(flags & IORING_ENTER_EXT_ARG)) { + *sig = (const sigset_t __user *) argp; + *ts = NULL; + return 0; + } + + /* + * EXT_ARG is set - ensure we agree on the size of it and copy in our + * timespec and sigset_t pointers if good. + */ + if (*argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + *sig = u64_to_user_ptr(arg.sigmask); + *argsz = arg.sigmask_sz; + *ts = u64_to_user_ptr(arg.ts); + return 0; +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const sigset_t __user *, sig, - size_t, sigsz) + u32, min_complete, u32, flags, const void __user *, argp, + size_t, argsz) { struct io_ring_ctx *ctx; long ret = -EBADF; @@ -9179,7 +9226,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT)) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) return -EINVAL; f = fdget(fd); @@ -9225,6 +9272,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (unlikely(ret)) + goto out; + min_complete = min(min_complete, ctx->cq_entries); /* @@ -9237,7 +9291,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, !(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_iopoll_check(ctx, min_complete); } else { - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } @@ -9600,7 +9654,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED; + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | + IORING_FEAT_EXT_ARG; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 37bea07c12f2..8576e8bf92fe 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -317,7 +317,7 @@ asmlinkage long sys_io_uring_setup(u32 entries, struct io_uring_params __user *p); asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, u32 min_complete, u32 flags, - const sigset_t __user *sig, size_t sigsz); + const void __user *argp, size_t argsz); asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, void __user *arg, unsigned int nr_args); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 557e7eae497f..6bb8229de892 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -231,6 +231,7 @@ struct io_cqring_offsets { #define IORING_ENTER_GETEVENTS (1U << 0) #define IORING_ENTER_SQ_WAKEUP (1U << 1) #define IORING_ENTER_SQ_WAIT (1U << 2) +#define IORING_ENTER_EXT_ARG (1U << 3) /* * Passed in for io_uring_setup(2). 
Copied back with updated info on success @@ -259,6 +260,7 @@ struct io_uring_params { #define IORING_FEAT_FAST_POLL (1U << 5) #define IORING_FEAT_POLL_32BITS (1U << 6) #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) +#define IORING_FEAT_EXT_ARG (1U << 8) /* * io_uring_register(2) opcodes and arguments @@ -335,4 +337,11 @@ enum { IORING_RESTRICTION_LAST }; +struct io_uring_getevents_arg { + __u64 sigmask; + __u32 sigmask_sz; + __u32 pad; + __u64 ts; +}; + #endif From 1a38ffc9cbca361cc274d6e234f5ef8922f0b6d9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 8 Nov 2020 12:55:55 +0000 Subject: [PATCH 16/43] io_uring: NULL files dereference by SQPOLL The SQPOLL task may find sqo_task->files == NULL, and __io_sq_thread_acquire_files() would leave it unset, so a following fget_many() and others try to dereference NULL and fault. Propagate an error when the files are missing. [ 118.962785] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 118.963812] #PF: supervisor read access in kernel mode [ 118.964534] #PF: error_code(0x0000) - not-present page [ 118.969029] RIP: 0010:__fget_files+0xb/0x80 [ 119.005409] Call Trace: [ 119.005651] fget_many+0x2b/0x30 [ 119.005964] io_file_get+0xcf/0x180 [ 119.006315] io_submit_sqes+0x3a4/0x950 [ 119.007481] io_sq_thread+0x1de/0x6a0 [ 119.007828] kthread+0x114/0x150 [ 119.008963] ret_from_fork+0x22/0x30 Reported-by: Josef Grieb Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ee25c70527aa..775a83ebc8ab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1061,7 +1061,7 @@ static void io_sq_thread_drop_mm_files(void) } } -static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) +static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) { if (!current->files) { struct files_struct *files; @@ -1071,7 +1071,7 @@ static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) files = ctx->sqo_task->files; if (!files) { task_unlock(ctx->sqo_task); - return; + return -EOWNERDEAD; } atomic_inc(&files->count); get_nsproxy(ctx->sqo_task->nsproxy); @@ -1083,6 +1083,7 @@ static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) current->nsproxy = nsproxy; task_unlock(current); } + return 0; } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) @@ -1114,15 +1115,19 @@ static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + int ret; if (def->work_flags & IO_WQ_WORK_MM) { - int ret = __io_sq_thread_acquire_mm(ctx); + ret = __io_sq_thread_acquire_mm(ctx); if (unlikely(ret)) return ret; } - if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) - __io_sq_thread_acquire_files(ctx); + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { + ret = __io_sq_thread_acquire_files(ctx); + if (unlikely(ret)) + return ret; + } return 0; } @@ -2130,8 +2135,8 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (!__io_sq_thread_acquire_mm(ctx)) { - __io_sq_thread_acquire_files(ctx); + if (!__io_sq_thread_acquire_mm(ctx) && + !__io_sq_thread_acquire_files(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); From 10fc72e43352753a08f9cf83aa5c40baec00d212 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sat, 7 Nov 2020 13:16:25 +0000 Subject: [PATCH 17/43] fs/io_uring: Don't use the return value from import_iovec().
This is the only code that relies on import_iovec() returning iter.count on success. This allows a better interface to import_iovec(). Signed-off-by: David Laight Signed-off-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 775a83ebc8ab..c1ac352e3e15 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3179,7 +3179,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; - return ret < 0 ? ret : sqe_len; + return ret; } if (req->flags & REQ_F_BUFFER_SELECT) { @@ -3205,7 +3205,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (!iorw) return __io_import_iovec(rw, req, iovec, iter, needs_lock); *iovec = NULL; - return iov_iter_count(&iorw->iter); + return 0; } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) @@ -3474,7 +3474,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (ret < 0) return ret; iov_count = iov_iter_count(iter); - io_size = ret; + io_size = iov_count; req->result = io_size; ret = 0; @@ -3602,7 +3602,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (ret < 0) return ret; iov_count = iov_iter_count(iter); - io_size = ret; + io_size = iov_count; req->result = io_size; /* Ensure we clear previously set non-block flag */ From 632546c4b5a4dad8e3ac456406c65c0db9a0b570 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 Nov 2020 13:16:26 +0000 Subject: [PATCH 18/43] io_uring: remove duplicated io_size from rw io_size and iov_count in io_read() and io_write() hold the same value, kill the last one. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c1ac352e3e15..b74048b5134a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3464,7 +3464,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; - size_t iov_count; bool no_async; if (rw) @@ -3473,8 +3472,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; - iov_count = iov_iter_count(iter); - io_size = iov_count; + io_size = iov_iter_count(iter); req->result = io_size; ret = 0; @@ -3490,7 +3488,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (no_async) goto copy_iov; - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); if (unlikely(ret)) goto out_free; @@ -3509,7 +3507,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (req->file->f_flags & O_NONBLOCK) goto done; /* some cases will consume bytes even on error returns */ - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; goto copy_iov; } else if (ret < 0) { @@ -3592,7 +3590,6 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; - size_t iov_count; ssize_t ret, ret2, io_size; if (rw) @@ -3601,8 +3598,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, ret = 
io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; - iov_count = iov_iter_count(iter); - io_size = iov_count; + io_size = iov_iter_count(iter); req->result = io_size; /* Ensure we clear previously set non-block flag */ @@ -3620,7 +3616,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); if (unlikely(ret)) goto out_free; @@ -3663,7 +3659,7 @@ done: } else { copy_iov: /* some cases will consume bytes even on error returns */ - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; From 2846c481c9dd1f1fb504b4885bcb815c311df532 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 Nov 2020 13:16:27 +0000 Subject: [PATCH 19/43] io_uring: inline io_import_iovec() Inline io_import_iovec() and leave only its former __io_import_iovec() renamed to the original name. That makes it more obious what is reused in io_read/write(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b74048b5134a..12a213b147c0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3150,7 +3150,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock) { @@ -3196,18 +3196,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, req->ctx->compat); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) -{ - struct io_async_rw *iorw = req->async_data; - - if (!iorw) - return __io_import_iovec(rw, req, iovec, iter, needs_lock); - *iovec = NULL; - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? 
NULL : &kiocb->ki_pos; @@ -3331,7 +3319,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) struct iovec *iov = iorw->fast_iov; ssize_t ret; - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); + ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) return ret; @@ -3466,12 +3454,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ssize_t io_size, ret, ret2; bool no_async; - if (rw) + if (rw) { iter = &rw->iter; - - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; + iovec = NULL; + } else { + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); + if (ret < 0) + return ret; + } io_size = iov_iter_count(iter); req->result = io_size; ret = 0; @@ -3592,12 +3582,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct io_async_rw *rw = req->async_data; ssize_t ret, ret2, io_size; - if (rw) + if (rw) { iter = &rw->iter; - - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; + iovec = NULL; + } else { + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); + if (ret < 0) + return ret; + } io_size = iov_iter_count(iter); req->result = io_size; From 06de5f5973c641c7ae033f133ecfaaf64fe633a6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:21 +0000 Subject: [PATCH 20/43] io_uring: simplify io_task_match() If IORING_SETUP_SQPOLL is set all requests belong to the corresponding SQPOLL task, so skip task checking in that case and always match. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 12a213b147c0..baff81313f54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1563,11 +1563,7 @@ static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) if (!tsk || req->task == tsk) return true; - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (ctx->sq_data && req->task == ctx->sq_data->thread) - return true; - } - return false; + return (ctx->flags & IORING_SETUP_SQPOLL); } /* From 08d23634643c239ddae706758f54d3a8e0c24962 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:22 +0000 Subject: [PATCH 21/43] io_uring: add a {task,files} pair matching helper Add io_match_task() that matches both task and files. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index baff81313f54..35828af37065 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1039,6 +1039,26 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } +static bool io_match_task(struct io_kiocb *head, + struct task_struct *task, + struct files_struct *files) +{ + struct io_kiocb *req; + + if (task && head->task != task) + return false; + if (!files) + return true; + + io_for_each_link(req, head) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES) && + req->work.identity->files == files) + return true; + } + return false; +} + static void io_sq_thread_drop_mm_files(void) { struct files_struct *files = current->files; @@ -1687,27 +1707,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } -static inline bool __io_match_files(struct io_kiocb *req, - struct files_struct *files) -{ - return ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES)) && - req->work.identity->files == files; -} - -static bool io_match_files(struct io_kiocb *head, struct files_struct *files) -{ - struct io_kiocb *req; - - if (!files) - return true; - io_for_each_link(req, head) { - if (__io_match_files(req, files)) - return true; - } - return false; -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, @@ -1735,9 +1734,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { - if (tsk && req->task != tsk) - continue; - if (!io_match_files(req, files)) + if (!io_match_task(req, tsk, files)) continue; cqe = io_get_cqring(ctx); @@ -8787,8 +8784,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_task_match(de->req, task) && - io_match_files(de->req, files)) { + if (io_match_task(de->req, task, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } From df9923f96717d0aebb0a73adbcf6285fa79e38cb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:23 +0000 Subject: [PATCH 22/43] io_uring: cancel only requests of current task io_uring_cancel_files() cancels all request that match files regardless of task. There is no real need in that, cancel only requests of the specified task. That also handles SQPOLL case as it already changes task to it. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 35828af37065..c42fd31cb314 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8665,14 +8665,6 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -static bool io_wq_files_match(struct io_wq_work *work, void *data) -{ - struct files_struct *files = data; - - return !files || ((work->flags & IO_WQ_WORK_FILES) && - work->identity->files == files); -} - /* * Returns true if 'preq' is the link parent of 'req' */ @@ -8805,21 +8797,20 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, * Returns true if we found and killed one or more files pinning requests */ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, + struct task_struct *task, struct files_struct *files) { if (list_empty_careful(&ctx->inflight_list)) return false; - /* cancel all at once, should be faster than doing it one by one*/ - io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); - while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (files && (req->work.flags & IO_WQ_WORK_FILES) && + if (req->task == task && + (req->work.flags & IO_WQ_WORK_FILES) && req->work.identity->files != files) continue; /* req is being completed, ignore */ @@ -8862,7 +8853,7 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { bool ret; - ret = io_uring_cancel_files(ctx, files); + ret = io_uring_cancel_files(ctx, task, files); if (!files) { enum io_wq_cancel cret; @@ -8901,11 +8892,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_sq_thread_park(ctx->sq_data); } - if (files) - io_cancel_defer_files(ctx, NULL, files); - else - io_cancel_defer_files(ctx, task, NULL); - + io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); while (__io_uring_cancel_task_requests(ctx, task, files)) { From b52fda00dd9df8b4a6de5784df94f9617f6133a1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:24 +0000 Subject: [PATCH 23/43] io_uring: don't iterate io_uring_cancel_files() io_uring_cancel_files() guarantees to cancel all matching requests, that's not necessary to do that in a loop. Move it up in the callchain into io_uring_cancel_task_requests(). 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c42fd31cb314..364f96486e05 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8793,16 +8793,10 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } -/* - * Returns true if we found and killed one or more files pinning requests - */ -static bool io_uring_cancel_files(struct io_ring_ctx *ctx, +static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { - if (list_empty_careful(&ctx->inflight_list)) - return false; - while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); @@ -8835,8 +8829,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, schedule(); finish_wait(&ctx->inflight_wait, &wait); } - - return true; } static bool io_cancel_task_cb(struct io_wq_work *work, void *data) @@ -8847,15 +8839,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return io_task_match(req, task); } -static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) +static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task) { - bool ret; - - ret = io_uring_cancel_files(ctx, task, files); - if (!files) { + while (1) { enum io_wq_cancel cret; + bool ret = false; cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); if (cret != IO_WQ_CANCEL_NOTFOUND) @@ -8871,9 +8860,11 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, ret |= io_poll_remove_all(ctx, task); ret |= io_kill_timeouts(ctx, task); + if (!ret) + break; + io_run_task_work(); + cond_resched(); } - - return ret; } /* @@ -8894,11 +8885,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); + io_uring_cancel_files(ctx, task, files); - while (__io_uring_cancel_task_requests(ctx, task, files)) { - io_run_task_work(); - cond_resched(); - } + if (!files) + __io_uring_cancel_task_requests(ctx, task); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); From 6b81928d4ca8668513251f9c04cdcb9d38ef51c7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:25 +0000 Subject: [PATCH 24/43] io_uring: pass files into kill timeouts/poll Make io_poll_remove_all() and io_kill_timeouts() to match against files as well. A preparation patch, effectively not used by now. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 364f96486e05..a51dcd33a1b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1589,14 +1589,15 @@ static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) /* * Returns true if we found and killed one or more timeouts */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) { struct io_kiocb *req, *tmp; int canceled = 0; spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_task_match(req, tsk)) { + if (io_match_task(req, tsk, files)) { io_kill_timeout(req); canceled++; } @@ -5473,7 +5474,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) /* * Returns true if we found and killed one or more poll requests */ -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5485,7 +5487,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_task_match(req, tsk)) + if (io_match_task(req, tsk, files)) posted += io_poll_remove_one(req); } } @@ -8626,8 +8628,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL); - io_poll_remove_all(ctx, NULL); + io_kill_timeouts(ctx, NULL, NULL); + io_poll_remove_all(ctx, NULL, NULL); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); @@ -8858,8 +8860,8 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } } - ret |= io_poll_remove_all(ctx, task); - ret |= io_kill_timeouts(ctx, task); + ret |= io_poll_remove_all(ctx, task, NULL); + ret |= io_kill_timeouts(ctx, task, NULL); if (!ret) break; io_run_task_work(); From f6edbabb8359798c541b0776616c5eab3a840d3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:26 +0000 Subject: [PATCH 25/43] io_uring: always batch cancel in *cancel_files() Instead of iterating over each request and cancelling it individually in io_uring_cancel_files(), try to cancel all matching requests and use ->inflight_list only to check if there anything left. In many cases it should be faster, and we can reuse a lot of code from task cancellation. 
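The predicate-driven batch cancel is worth spelling out. Below is a
small self-contained userspace model of the pattern, not kernel code:
one matcher is handed to the cancellation core, which sweeps everything
matching in a single pass instead of cancelling requests one by one.
All names are invented, and the locking the real code needs is ignored.

#include <stdbool.h>
#include <stdio.h>

struct request {
	int task_id;
	int files_id;			/* 0 == no files attached */
	bool cancelled;
	struct request *next;
};

struct cancel_match {
	int task_id;
	int files_id;
};

static bool req_matches(const struct request *req, const void *data)
{
	const struct cancel_match *m = data;

	if (req->task_id != m->task_id)
		return false;
	return !m->files_id || req->files_id == m->files_id;
}

/* cancel *all* matching requests in one sweep, return how many */
static int cancel_matching(struct request *head,
			   bool (*match)(const struct request *, const void *),
			   const void *data)
{
	int n = 0;

	for (struct request *req = head; req; req = req->next) {
		if (!req->cancelled && match(req, data)) {
			req->cancelled = true;
			n++;
		}
	}
	return n;
}

int main(void)
{
	struct request r3 = { .task_id = 2, .files_id = 7 };
	struct request r2 = { .task_id = 1, .files_id = 7, .next = &r3 };
	struct request r1 = { .task_id = 1, .files_id = 0, .next = &r2 };
	struct cancel_match m = { .task_id = 1, .files_id = 7 };

	/* only r2 matches: same task and same files */
	printf("cancelled %d\n", cancel_matching(&r1, req_matches, &m));
	return 0;
}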
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ---- fs/io-wq.h | 1 - fs/io_uring.c | 135 ++++++++------------------------------------------ 3 files changed, 21 insertions(+), 125 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b53c055bea6a..f72d53848dcb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1078,16 +1078,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) -{ - return work == data; -} - -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) -{ - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index cba36f03c355..069496c6d4f9 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -129,7 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) } void io_wq_cancel_all(struct io_wq *wq); -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/fs/io_uring.c b/fs/io_uring.c index a51dcd33a1b8..733ba91e0205 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1577,15 +1577,6 @@ static void io_kill_timeout(struct io_kiocb *req) } } -static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (!tsk || req->task == tsk) - return true; - return (ctx->flags & IORING_SETUP_SQPOLL); -} - /* * Returns true if we found and killed one or more timeouts */ @@ -8667,108 +8658,31 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -/* - * Returns true if 'preq' is the link parent of 'req' - */ -static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) -{ - struct io_kiocb *link; +struct io_task_cancel { + struct task_struct *task; + struct files_struct *files; +}; - io_for_each_link(link, preq->link) { - if (link == req) - return true; - } - return false; -} - -/* - * We're looking to cancel 'req' because it's holding on to our files, but - * 'req' could be a link to another request. See if it is, and cancel that - * parent request if so. 
- */ -static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) -{ - struct hlist_node *tmp; - struct io_kiocb *preq; - bool found = false; - int i; - - spin_lock_irq(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(preq, tmp, list, hash_node) { - found = io_match_link(preq, req); - if (found) { - io_poll_remove_one(preq); - break; - } - } - } - spin_unlock_irq(&ctx->completion_lock); - return found; -} - -static bool io_timeout_remove_link(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_kiocb *preq; - bool found = false; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { - found = io_match_link(preq, req); - if (found) { - __io_timeout_cancel(preq); - break; - } - } - spin_unlock_irq(&ctx->completion_lock); - return found; -} - -static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_task_cancel *cancel = data; bool ret; - if (req->flags & REQ_F_LINK_TIMEOUT) { + if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ spin_lock_irqsave(&ctx->completion_lock, flags); - ret = io_match_link(req, data); + ret = io_match_task(req, cancel->task, cancel->files); spin_unlock_irqrestore(&ctx->completion_lock, flags); } else { - ret = io_match_link(req, data); + ret = io_match_task(req, cancel->task, cancel->files); } return ret; } -static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) -{ - enum io_wq_cancel cret; - - /* cancel this particular work, if it's running */ - cret = io_wq_cancel_work(ctx->io_wq, &req->work); - if (cret != IO_WQ_CANCEL_NOTFOUND) - return; - - /* find links that hold this pending, cancel those */ - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); - if (cret != IO_WQ_CANCEL_NOTFOUND) - return; - - /* if we have a poll link holding this pending, cancel that */ - if (io_poll_remove_link(ctx, req)) - return; - - /* final option, timeout link is holding this req pending */ - io_timeout_remove_link(ctx, req); -} - static void io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) @@ -8800,8 +8714,10 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL, *req; + struct io_task_cancel cancel = { .task = task, .files = NULL, }; + struct io_kiocb *req; DEFINE_WAIT(wait); + bool found = false; spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { @@ -8809,23 +8725,21 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, (req->work.flags & IO_WQ_WORK_FILES) && req->work.identity->files != files) continue; - /* req is being completed, ignore */ - if (!refcount_inc_not_zero(&req->refs)) - continue; - cancel_req = req; + found = true; break; } - if (cancel_req) + if (found) prepare_to_wait(&ctx->inflight_wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ctx->inflight_lock); /* We need to keep going until we don't find a matching req */ - if (!cancel_req) + if (!found) break; - /* cancel this request, or head link requests */ - io_attempt_cancel(ctx, 
cancel_req);
-		io_put_req(cancel_req);
+
+		io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
+		io_poll_remove_all(ctx, task, files);
+		io_kill_timeouts(ctx, task, files);
 		/* cancellations _may_ trigger task work */
 		io_run_task_work();
 		schedule();
@@ -8833,22 +8747,15 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
-{
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-	struct task_struct *task = data;
-
-	return io_task_match(req, task);
-}
-
 static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 					    struct task_struct *task)
 {
 	while (1) {
+		struct io_task_cancel cancel = { .task = task, .files = NULL, };
 		enum io_wq_cancel cret;
 		bool ret = false;
 
-		cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+		cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
 		if (cret != IO_WQ_CANCEL_NOTFOUND)
 			ret = true;

From 08369246344077a9cf8109c1cf92a640733314f2 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang
Date: Tue, 3 Nov 2020 14:15:59 +0800
Subject: [PATCH 26/43] io_uring: refactor io_sq_thread() handling

There are some issues with the current io_sq_thread() implementation:

1. The prepare_to_wait() usage in __io_sq_thread() is odd. If multiple
ctxs share one poll thread, one ctx will put the poll thread in
TASK_INTERRUPTIBLE, but if the other ctxs still have work to do, there
is no need to change the task's state at all. Only when none of the
ctxs has work to do should we do that.

2. We use a round-robin strategy to make multiple ctxs share one poll
thread, but there are various conditions in __io_sq_thread(), which
seems complicated and may affect the round-robin strategy.

To improve the above issues, take the following actions:

1. If multiple ctxs share one poll thread, only call prepare_to_wait()
and schedule() to put the poll thread to sleep when none of the ctxs
has work to do.

2. To make the round-robin strategy more straightforward, simplify
__io_sq_thread() a bit: it just does io poll and sqe submission once,
and does not check various conditions.

3. When multiple ctxs share one poll thread, choose the biggest
sq_thread_idle among these ctxs as the timeout condition, and update
it when a ctx is added or removed.

4. There is no need to check EBUSY specially: if io_submit_sqes()
returns EBUSY, IORING_SQ_CQ_OVERFLOW will be set, and the helpers in
liburing will notice the cq overflow and enter the kernel to flush
work.
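The spin-then-sleep policy in points 1-3 can be modeled compactly. A
self-contained userspace sketch of the timeout bookkeeping only
(illustrative, not kernel code: ticks stand in for jiffies, and
do_work_once() stands in for submitting/polling all ctxs once):

#include <stdbool.h>
#include <stdio.h>

static bool do_work_once(int iter)
{
	/* pretend the rings have work for the first 3 iterations */
	return iter < 3;
}

int main(void)
{
	const unsigned int idle_window = 2;	/* "sq_thread_idle", in ticks */
	unsigned int now = 0, timeout = idle_window;

	for (int iter = 0; iter < 10; iter++, now++) {
		bool did_work = do_work_once(iter);

		if (did_work)
			timeout = now + idle_window;	/* refresh the window */

		if (did_work || now <= timeout) {
			printf("tick %u: spin\n", now);
			continue;
		}

		/* here the real thread sets IORING_SQ_NEED_WAKEUP and schedules */
		printf("tick %u: sleep\n", now);
		timeout = now + idle_window;
	}
	return 0;
}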
Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 169 ++++++++++++++++++++------------------------------ 1 file changed, 67 insertions(+), 102 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 733ba91e0205..b789b9af2f4c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -245,6 +245,8 @@ struct io_sq_data { struct task_struct *thread; struct wait_queue_head wait; + + unsigned sq_thread_idle; }; struct io_ring_ctx { @@ -310,7 +312,6 @@ struct io_ring_ctx { struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; - struct wait_queue_entry sqo_wait_entry; struct list_head sqd_list; /* @@ -6841,111 +6842,49 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, - int sync, void *key) +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { - struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); - int ret; - - ret = autoremove_wake_function(wqe, mode, sync, key); - if (ret) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } - return ret; -} - -enum sq_ret { - SQT_IDLE = 1, - SQT_SPIN = 2, - SQT_DID_WORK = 4, -}; - -static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, - unsigned long start_jiffies, bool cap_entries) -{ - unsigned long timeout = start_jiffies + ctx->sq_thread_idle; - struct io_sq_data *sqd = ctx->sq_data; unsigned int to_submit; int ret = 0; -again: if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list) && !need_resched()) + if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); mutex_unlock(&ctx->uring_lock); } to_submit = io_sqring_entries(ctx); - - /* - * If submit got -EBUSY, flag us as needing the application - * to enter the kernel to reap and flush events. - */ - if (!to_submit || ret == -EBUSY || need_resched()) { - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. - */ - io_sq_thread_drop_mm_files(); - - /* - * We're polling. If we're within the defined idle - * period, then let us spin without work before going - * to sleep. The exception is if we got EBUSY doing - * more IO, we should wait for the application to - * reap events and wake us up. - */ - if (!list_empty(&ctx->iopoll_list) || need_resched() || - (!time_after(jiffies, timeout) && ret != -EBUSY && - !percpu_ref_is_dying(&ctx->refs))) - return SQT_SPIN; - - prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, - TASK_INTERRUPTIBLE); - - /* - * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to iopoll_list, - * it is because reqs may have been punted to io worker - * and will be added to iopoll_list later, hence check - * the iopoll_list again. 
- */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); - goto again; - } - - to_submit = io_sqring_entries(ctx); - if (!to_submit || ret == -EBUSY) - return SQT_IDLE; - } - - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); - io_ring_clear_wakeup_flag(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ if (cap_entries && to_submit > 8) to_submit = 8; - mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); + if (to_submit) { + mutex_lock(&ctx->uring_lock); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + } if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); - return SQT_DID_WORK; + return ret; +} + +static void io_sqd_update_thread_idle(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + unsigned sq_thread_idle = 0; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (sq_thread_idle < ctx->sq_thread_idle) + sq_thread_idle = ctx->sq_thread_idle; + } + + sqd->sq_thread_idle = sq_thread_idle; } static void io_sqd_init_new(struct io_sq_data *sqd) @@ -6954,11 +6893,11 @@ static void io_sqd_init_new(struct io_sq_data *sqd) while (!list_empty(&sqd->ctx_new_list)) { ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); - init_wait(&ctx->sqo_wait_entry); - ctx->sqo_wait_entry.func = io_sq_wake_function; list_move_tail(&ctx->sqd_list, &sqd->ctx_list); complete(&ctx->sq_thread_comp); } + + io_sqd_update_thread_idle(sqd); } static int io_sq_thread(void *data) @@ -6969,17 +6908,17 @@ static int io_sq_thread(void *data) const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; - unsigned long start_jiffies; + unsigned long timeout; + DEFINE_WAIT(wait); task_lock(current); current->files = NULL; current->nsproxy = NULL; task_unlock(current); - start_jiffies = jiffies; while (!kthread_should_stop()) { - enum sq_ret ret = 0; - bool cap_entries; + int ret; + bool cap_entries, sqt_spin, needs_sched; /* * Any changes to the sqd lists are synchronized through the @@ -6989,11 +6928,13 @@ static int io_sq_thread(void *data) if (kthread_should_park()) kthread_parkme(); - if (unlikely(!list_empty(&sqd->ctx_new_list))) + if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); + timeout = jiffies + sqd->sq_thread_idle; + } + sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { if (current->cred != ctx->creds) { if (old_cred) @@ -7006,24 +6947,49 @@ static int io_sq_thread(void *data) current->sessionid = ctx->sessionid; #endif - ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); + ret = __io_sq_thread(ctx, cap_entries); + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) + sqt_spin = true; io_sq_thread_drop_mm_files(); } - if (ret & SQT_SPIN) { + if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); cond_resched(); - } else if (ret == SQT_IDLE) { - if (kthread_should_park()) - continue; + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + continue; + } + + if (kthread_should_park()) + continue; + + needs_sched = true; + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) 
{ + needs_sched = false; + break; + } + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); + schedule(); - start_jiffies = jiffies; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } + + finish_wait(&sqd->wait, &wait); + timeout = jiffies + sqd->sq_thread_idle; } io_run_task_work(); @@ -7335,12 +7301,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) mutex_lock(&sqd->ctx_lock); list_del(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); mutex_unlock(&sqd->ctx_lock); - if (sqd->thread) { - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + if (sqd->thread) io_sq_thread_unpark(sqd); - } io_put_sq_data(sqd); ctx->sq_data = NULL; From a0d9205f7d36bf72279f34a93850fd14789fdc7e Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 12 Nov 2020 14:55:59 +0800 Subject: [PATCH 27/43] io_uring: initialize 'timeout' properly in io_sq_thread() Some static checker reports below warning: fs/io_uring.c:6939 io_sq_thread() error: uninitialized symbol 'timeout'. This is a false positive, but let's just initialize 'timeout' to make sure we don't trip over this. Reported-by: Dan Carpenter Signed-off-by: Xiaoguang Wang Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b789b9af2f4c..d52d6f529dc6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6908,7 +6908,7 @@ static int io_sq_thread(void *data) const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; - unsigned long timeout; + unsigned long timeout = 0; DEFINE_WAIT(wait); task_lock(current); From 906a3c6f9ca072e917c701f7421647e169740954 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 12 Nov 2020 14:56:00 +0800 Subject: [PATCH 28/43] io_uring: don't acquire uring_lock twice Both IOPOLL and sqes handling need to acquire uring_lock, combine them together, then we just need to acquire uring_lock once. 
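The change is a plain instance of merging adjacent critical sections on
the same lock. A hedged userspace sketch of the before/after shape with
pthreads (function names invented for illustration):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void reap_completions(void) { /* stand-in for io_do_iopoll() */ }
static int submit_pending(void) { /* stand-in for io_submit_sqes() */ return 0; }

/* before: two lock/unlock round trips */
static int poll_then_submit_twice_locked(void)
{
	int ret;

	pthread_mutex_lock(&lock);
	reap_completions();
	pthread_mutex_unlock(&lock);

	pthread_mutex_lock(&lock);
	ret = submit_pending();
	pthread_mutex_unlock(&lock);
	return ret;
}

/* after: one acquisition covers both phases */
static int poll_then_submit_once_locked(void)
{
	int ret;

	pthread_mutex_lock(&lock);
	reap_completions();
	ret = submit_pending();
	pthread_mutex_unlock(&lock);
	return ret;
}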
Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d52d6f529dc6..67bf9047d230 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6847,23 +6847,19 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) unsigned int to_submit; int ret = 0; - if (!list_empty(&ctx->iopoll_list)) { - unsigned nr_events = 0; - - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0); - mutex_unlock(&ctx->uring_lock); - } - to_submit = io_sqring_entries(ctx); /* if we're handling multiple rings, cap submit size for fairness */ if (cap_entries && to_submit > 8) to_submit = 8; - if (to_submit) { + if (!list_empty(&ctx->iopoll_list) || to_submit) { + unsigned nr_events = 0; + mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) + if (!list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, &nr_events, 0); + + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } From 2e9dbe902d1020ef70f968e8675c8d2457c4ffaa Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Fri, 13 Nov 2020 00:44:08 +0800 Subject: [PATCH 29/43] io_uring: only wake up sq thread while current task is in io worker context If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread task context or in io worker task context. If current task context is sq thread, we don't need to check whether should wake up sq thread. io_iopoll_req_issued() calls wq_has_sleeper(), which has smp_mb() memory barrier, before this patch, perf shows obvious overhead: Samples: 481K of event 'cycles', Event count (approx.): 299807382878 Overhead Comma Shared Object Symbol 3.69% :9630 [kernel.vmlinux] [k] io_issue_sqe With this patch, perf shows: Samples: 482K of event 'cycles', Event count (approx.): 299929547283 Overhead Comma Shared Object Symbol 0.70% :4015 [kernel.vmlinux] [k] io_issue_sqe It shows some obvious improvements. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 67bf9047d230..3617bde95de2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2712,7 +2712,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_iopoll_getevents() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req) +static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) { struct io_ring_ctx *ctx = req->ctx; @@ -2741,7 +2741,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req) else list_add_tail(&req->inflight_entry, &ctx->iopoll_list); - if ((ctx->flags & IORING_SETUP_SQPOLL) && + /* + * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread + * task context or in io worker task context. If current task context is + * sq thread, we don't need to check whether should wake up sq thread. 
+	 */
+	if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
 	    wq_has_sleeper(&ctx->sq_data->wait))
 		wake_up(&ctx->sq_data->wait);
 }
@@ -6241,7 +6246,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 
 	if (in_async)
 		mutex_lock(&ctx->uring_lock);
 
-	io_iopoll_req_issued(req);
+	io_iopoll_req_issued(req, in_async);
 
 	if (in_async)
 		mutex_unlock(&ctx->uring_lock);

From 10cad2c40dcb04bb46b2bf399e00ca5ea93d36b0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sat, 7 Nov 2020 13:20:39 +0000
Subject: [PATCH 30/43] io_uring: don't take fs for recvmsg/sendmsg

We only allow plain-data msg_control; anything else is disallowed in
__sys_{send,recv}msg_sock(). So there is no need for fs in
IORING_OP_SENDMSG and IORING_OP_RECVMSG. fs->lock is not as contended
as it used to be, but there are still cases where it can be, e.g.
IOSQE_ASYNC.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3617bde95de2..33c448776731 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -852,8 +852,7 @@ static const struct io_op_def io_op_defs[] = {
 		.pollout		= 1,
 		.needs_async_data	= 1,
 		.async_size		= sizeof(struct io_async_msghdr),
-		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-						IO_WQ_WORK_FS,
+		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 	},
 	[IORING_OP_RECVMSG] = {
 		.needs_file		= 1,
@@ -862,8 +861,7 @@ static const struct io_op_def io_op_defs[] = {
 		.buffer_select		= 1,
 		.needs_async_data	= 1,
 		.async_size		= sizeof(struct io_async_msghdr),
-		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-						IO_WQ_WORK_FS,
+		.work_flags		= IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 	},
 	[IORING_OP_TIMEOUT] = {
 		.needs_async_data	= 1,

From c98de08c990e190fc7cc3aaf8079b4a0674c6425 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 15 Nov 2020 12:56:32 +0000
Subject: [PATCH 31/43] io_uring: replace inflight_wait with tctx->wait

As tasks now cancel only their own requests, and inflight_wait is
awaited only in io_uring_cancel_files(), which should be called with
->in_idle set, use tctx->wait instead of keeping a separate
inflight_wait. That will add some spurious wakeups but is actually
safer in the sense of not hanging the task. e.g.

task1                    | IRQ
                         | *start* io_complete_rw_common(link)
                         | link: req1 -> req2 -> req3(with files)
*cancel_files()          |
io_wq_cancel(), etc.     |
                         | put_req(link), adds to io-wq req2
schedule()               |

So, task1 will never try to cancel req2 or req3. If req2 is
long-standing (e.g. read(empty_pipe)), this may hang.
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 33c448776731..c42321ecda5d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -287,7 +287,6 @@ struct io_ring_ctx {
 		struct list_head	timeout_list;
 		struct list_head	cq_overflow_list;
 
-		wait_queue_head_t	inflight_wait;
 		struct io_uring_sqe	*sq_sqes;
 	} ____cacheline_aligned_in_smp;
@@ -1291,7 +1290,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->iopoll_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
-	init_waitqueue_head(&ctx->inflight_wait);
 	spin_lock_init(&ctx->inflight_lock);
 	INIT_LIST_HEAD(&ctx->inflight_list);
 	INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -6046,12 +6044,13 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static void io_req_drop_files(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_uring_task *tctx = req->task->io_uring;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->inflight_lock, flags);
 	list_del(&req->inflight_entry);
-	if (waitqueue_active(&ctx->inflight_wait))
-		wake_up(&ctx->inflight_wait);
+	if (atomic_read(&tctx->in_idle))
+		wake_up(&tctx->wait);
 	spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	req->flags &= ~REQ_F_INFLIGHT;
 	put_files_struct(req->work.identity->files);
@@ -8693,8 +8692,8 @@
 			break;
 		}
 		if (found)
-			prepare_to_wait(&ctx->inflight_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
+			prepare_to_wait(&task->io_uring->wait, &wait,
+					TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(&ctx->inflight_lock);
 
 		/* We need to keep going until we don't find a matching req */
@@ -8707,7 +8706,7 @@
 		/* cancellations _may_ trigger task work */
 		io_run_task_work();
 		schedule();
-		finish_wait(&ctx->inflight_wait, &wait);
+		finish_wait(&task->io_uring->wait, &wait);
 	}
 }

From 36f72fe2792c4304f1203a44a6a7178e49b447f7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 18 Nov 2020 19:57:26 +0000
Subject: [PATCH 32/43] io_uring: share fixed_file_refs b/w multiple rsrcs

Double fixed files for splice/tee are done in a nasty way: it takes 2
ref_node refs, and the second time it blindly overrides
req->fixed_file_refs, hoping that it hasn't changed. That works because
it is all done under uring_lock in a single go, but it is error-prone.

Bind everything explicitly to a single ref_node and take only one ref;
with the current ref_node ordering it's guaranteed to keep all files
valid while the request is inflight.

That's mainly a cleanup + preparation for generic resource handling,
but it also saves a pcpu_ref get/put for splice/tee with 2 fixed files.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c42321ecda5d..5818a7b3d29b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1037,6 +1037,16 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } +static inline void io_set_resource_node(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!req->fixed_file_refs) { + req->fixed_file_refs = &ctx->file_data->node->refs; + percpu_ref_get(req->fixed_file_refs); + } +} + static bool io_match_task(struct io_kiocb *head, struct task_struct *task, struct files_struct *files) @@ -1919,9 +1929,7 @@ fallback: static inline void io_put_file(struct io_kiocb *req, struct file *file, bool fixed) { - if (fixed) - percpu_ref_put(req->fixed_file_refs); - else + if (!fixed) fput(file); } @@ -1933,7 +1941,8 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - + if (req->fixed_file_refs) + percpu_ref_put(req->fixed_file_refs); io_req_clean_work(req); } @@ -6310,10 +6319,7 @@ static struct file *io_file_get(struct io_submit_state *state, return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); - if (file) { - req->fixed_file_refs = &ctx->file_data->node->refs; - percpu_ref_get(req->fixed_file_refs); - } + io_set_resource_node(req); } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); @@ -6691,6 +6697,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->ctx = ctx; req->flags = 0; req->link = NULL; + req->fixed_file_refs = NULL; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->task = current; From 65b2b213484acd89a3c20dbb524e52a2f3793b78 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 19 Nov 2020 17:44:46 +0800 Subject: [PATCH 33/43] io_uring: check kthread stopped flag when sq thread is unparked syzbot reports following issue: INFO: task syz-executor.2:12399 can't die for more than 143 seconds. task:syz-executor.2 state:D stack:28744 pid:12399 ppid: 8504 flags:0x00004004 Call Trace: context_switch kernel/sched/core.c:3773 [inline] __schedule+0x893/0x2170 kernel/sched/core.c:4522 schedule+0xcf/0x270 kernel/sched/core.c:4600 schedule_timeout+0x1d8/0x250 kernel/time/timer.c:1847 do_wait_for_common kernel/sched/completion.c:85 [inline] __wait_for_common kernel/sched/completion.c:106 [inline] wait_for_common kernel/sched/completion.c:117 [inline] wait_for_completion+0x163/0x260 kernel/sched/completion.c:138 kthread_stop+0x17a/0x720 kernel/kthread.c:596 io_put_sq_data fs/io_uring.c:7193 [inline] io_sq_thread_stop+0x452/0x570 fs/io_uring.c:7290 io_finish_async fs/io_uring.c:7297 [inline] io_sq_offload_create fs/io_uring.c:8015 [inline] io_uring_create fs/io_uring.c:9433 [inline] io_uring_setup+0x19b7/0x3730 fs/io_uring.c:9507 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45deb9 Code: Unable to access opcode bytes at RIP 0x45de8f. 
RSP: 002b:00007f174e51ac78 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000008640 RCX: 000000000045deb9 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 00000000000050e5 RBP: 000000000118bf58 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000118bf2c R13: 00007ffed9ca723f R14: 00007f174e51b9c0 R15: 000000000118bf2c INFO: task syz-executor.2:12399 blocked for more than 143 seconds. Not tainted 5.10.0-rc3-next-20201110-syzkaller #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. Currently we don't have a reproducer yet, but seems that there is a race in current codes: => io_put_sq_data ctx_list is empty now. | ==> kthread_park(sqd->thread); | | T1: sq thread is parked now. ==> kthread_stop(sqd->thread); | KTHREAD_SHOULD_STOP is set now.| ===> kthread_unpark(k); | | T2: sq thread is now unparkd, run again. | | T3: sq thread is now preempted out. | ===> wake_up_process(k); | | | T4: Since sqd ctx_list is empty, needs_sched will be true, | then sq thread sets task state to TASK_INTERRUPTIBLE, | and schedule, now sq thread will never be waken up. ===> wait_for_completion | I have artificially used mdelay() to simulate above race, will get same stack like this syzbot report, but to be honest, I'm not sure this code race triggers syzbot report. To fix this possible code race, when sq thread is unparked, need to check whether sq thread has been stopped. Reported-by: syzbot+03beeb595f074db9cfd1@syzkaller.appspotmail.com Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5818a7b3d29b..6f18b5f27404 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6930,8 +6930,16 @@ static int io_sq_thread(void *data) * kthread parking. This synchronizes the thread vs users, * the users are synchronized on the sqd->ctx_lock. */ - if (kthread_should_park()) + if (kthread_should_park()) { kthread_parkme(); + /* + * When sq thread is unparked, in case the previous park operation + * comes from io_put_sq_data(), which means that sq thread is going + * to be stopped, so here needs to have a check. + */ + if (kthread_should_stop()) + break; + } if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); From 6e1271e60c1d5e822fd1a32a56d52d9ae1823e62 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Nov 2020 15:50:50 +0000 Subject: [PATCH 34/43] io_uring: change submit file state invariant Keep submit state invariant of whether there are file refs left based on state->nr_refs instead of (state->file==NULL), and always check against the first one. It's easier to track and allows to remove 1 if. It also automatically leaves struct submit_state in a consistent state after io_submit_state_end(), that's not used yet but nice. btw rename has_refs to file_refs for more clarity. 
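For readers new to the submit-state file cache this patch touches, the
idea is: batch-acquire references to an fd once, then hand them out as
subsequent requests hit the same fd. A self-contained userspace model
(illustrative only: get_refs()/put_refs() stand in for
fget_many()/fput_many(), and the refill policy is simplified):

#include <stdio.h>

static int held_refs;	/* pretend per-file refcount */

static void get_refs(int fd, unsigned int n) { (void)fd; held_refs += n; }
static void put_refs(int fd, unsigned int n) { (void)fd; held_refs -= n; }

struct submit_state {
	int fd;
	unsigned int file_refs;	/* unused pre-acquired refs */
	unsigned int ios_left;
};

static int state_file_get(struct submit_state *s, int fd)
{
	if (s->file_refs) {
		if (s->fd == fd) {
			s->file_refs--;		/* reuse a cached ref */
			return fd;
		}
		put_refs(s->fd, s->file_refs);	/* fd changed: drop leftovers */
		s->file_refs = 0;
	}
	get_refs(fd, s->ios_left);		/* batch-acquire for this fd */
	s->fd = fd;
	s->file_refs = s->ios_left - 1;		/* one ref is consumed now */
	return fd;
}

static void state_file_put(struct submit_state *s)
{
	if (s->file_refs) {
		put_refs(s->fd, s->file_refs);
		s->file_refs = 0;
	}
}

int main(void)
{
	struct submit_state s = { .ios_left = 4 };

	for (int i = 0; i < 3; i++)
		state_file_get(&s, 5);	/* same fd three times */
	state_file_put(&s);
	printf("refs consumed by requests: %d\n", held_refs);
	return 0;
}

Note how the invariant matches the commit message: file_refs > 0 is
what says a cached file is held, so state_file_put() leaves the state
consistent without any extra NULL check.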
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f18b5f27404..a83593cadcc9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -762,7 +762,7 @@ struct io_submit_state { */ struct file *file; unsigned int fd; - unsigned int has_refs; + unsigned int file_refs; unsigned int ios_left; }; @@ -2756,16 +2756,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) wake_up(&ctx->sq_data->wait); } -static void __io_state_file_put(struct io_submit_state *state) +static inline void __io_state_file_put(struct io_submit_state *state) { - if (state->has_refs) - fput_many(state->file, state->has_refs); - state->file = NULL; + fput_many(state->file, state->file_refs); + state->file_refs = 0; } static inline void io_state_file_put(struct io_submit_state *state) { - if (state->file) + if (state->file_refs) __io_state_file_put(state); } @@ -2779,19 +2778,19 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (!state) return fget(fd); - if (state->file) { + if (state->file_refs) { if (state->fd == fd) { - state->has_refs--; + state->file_refs--; return state->file; } __io_state_file_put(state); } state->file = fget_many(fd, state->ios_left); - if (!state->file) + if (unlikely(!state->file)) return NULL; state->fd = fd; - state->has_refs = state->ios_left - 1; + state->file_refs = state->ios_left - 1; return state->file; } @@ -6601,7 +6600,7 @@ static void io_submit_state_start(struct io_submit_state *state, INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; state->free_reqs = 0; - state->file = NULL; + state->file_refs = 0; state->ios_left = max_ios; } From bd5bbda72f7fa013ddea0ff7c4d91daedb821869 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Nov 2020 15:50:51 +0000 Subject: [PATCH 35/43] io_uring: fix miscounting ios_left io_req_init() doesn't decrement state->ios_left if a request doesn't need ->file, it just returns before that on if(!needs_file). That's not really a problem but may cause overhead for an additional fput(). Also inline and kill io_req_set_file() as it's of no use anymore. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a83593cadcc9..91536947ef08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6327,15 +6327,6 @@ static struct file *io_file_get(struct io_submit_state *state, return file; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - int fd) -{ - req->file = io_file_get(state, req, fd, req->flags & REQ_F_FIXED_FILE); - if (req->file || io_op_defs[req->opcode].needs_file_no_error) - return 0; - return -EBADF; -} - static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6748,10 +6739,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->plug_started = true; } - if (!io_op_defs[req->opcode].needs_file) - return 0; + ret = 0; + if (io_op_defs[req->opcode].needs_file) { + bool fixed = req->flags & REQ_F_FIXED_FILE; + + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + if (unlikely(!req->file && + !io_op_defs[req->opcode].needs_file_no_error)) + ret = -EBADF; + } - ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); state->ios_left--; return ret; } From ac0648a56c1ff66c1cbf735075ad33a26cbc50de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Nov 2020 09:37:51 -0700 Subject: [PATCH 36/43] io_uring: use bottom half safe lock for fixed file data io_file_data_ref_zero() can be invoked from soft-irq from the RCU core, hence we need to ensure that the file_data lock is bottom half safe. Use the _bh() variants when grabbing this lock. Reported-by: syzbot+1f4ba1e5520762c523c6@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91536947ef08..e66888d45778 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7185,9 +7185,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (!data) return -ENXIO; - spin_lock(&data->lock); + spin_lock_bh(&data->lock); ref_node = data->node; - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); if (ref_node) percpu_ref_kill(&ref_node->refs); @@ -7569,7 +7569,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) data = ref_node->file_data; ctx = data->ctx; - spin_lock(&data->lock); + spin_lock_bh(&data->lock); ref_node->done = true; while (!list_empty(&data->ref_list)) { @@ -7581,7 +7581,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) list_del(&ref_node->node); first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); } - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); if (percpu_ref_is_dying(&data->refs)) delay = 0; @@ -7704,9 +7704,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } file_data->node = ref_node; - spin_lock(&file_data->lock); + spin_lock_bh(&file_data->lock); list_add_tail(&ref_node->node, &file_data->ref_list); - spin_unlock(&file_data->lock); + spin_unlock_bh(&file_data->lock); percpu_ref_get(&file_data->refs); return ret; out_fput: @@ -7863,10 +7863,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - spin_lock(&data->lock); + spin_lock_bh(&data->lock); list_add_tail(&ref_node->node, &data->ref_list); data->node = ref_node; - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else 
destroy_fixed_file_ref_node(ref_node); From bee749b187ac57d1faf00b2ab356ff322230fce8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Nov 2020 02:19:23 +0000 Subject: [PATCH 37/43] io_uring: fix files cancellation io_uring_cancel_files()'s task check condition mistakenly got flipped. 1. There can't be a request in the inflight list without IO_WQ_WORK_FILES, kill this check to keep the whole condition simpler. 2. Also, don't call the function for files==NULL to not do such a check, all that staff is already handled well by its counter part, __io_uring_cancel_task_requests(). With that just flip the task check. Also, it iowq-cancels all request of current task there, don't forget to set right ->files into struct io_task_cancel. Fixes: c1973b38bf639 ("io_uring: cancel only requests of current task") Reported-by: syzbot+c0d52d0b3c0c3ffb9525@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e66888d45778..f47de27e5125 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8688,15 +8688,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { while (!list_empty_careful(&ctx->inflight_list)) { - struct io_task_cancel cancel = { .task = task, .files = NULL, }; + struct io_task_cancel cancel = { .task = task, .files = files }; struct io_kiocb *req; DEFINE_WAIT(wait); bool found = false; spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->task == task && - (req->work.flags & IO_WQ_WORK_FILES) && + if (req->task != task || req->work.identity->files != files) continue; found = true; @@ -8768,10 +8767,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); - io_uring_cancel_files(ctx, task, files); if (!files) __io_uring_cancel_task_requests(ctx, task); + else + io_uring_cancel_files(ctx, task, files); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); From fbd15848f3c13506253b6c5de0077a603947cb67 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Nov 2020 19:11:15 +0000 Subject: [PATCH 38/43] io_uring: restructure io_timeout_cancel() Add io_timeout_extract() helper, which searches and disarms timeouts, but doesn't complete them. No functional changes. 
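The refactor leans on the kernel's ERR_PTR convention: a lookup helper
returns either a valid pointer or an errno encoded in the pointer, and
the caller unpacks it with IS_ERR()/PTR_ERR(). A self-contained
userspace model of the idiom (the three helpers below reimplement the
kernel macros purely for the demo):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() */
static inline void *ERR_PTR(long err) { return (void *)(intptr_t)err; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-4095;
}
static inline long PTR_ERR(const void *p) { return (intptr_t)p; }

struct timeout { int armed; };

/* like io_timeout_extract(): find and disarm, but don't complete */
static struct timeout *extract_timeout(struct timeout *t, int found)
{
	if (!found)
		return ERR_PTR(-ENOENT);
	if (!t->armed)
		return ERR_PTR(-EALREADY);
	t->armed = 0;
	return t;
}

int main(void)
{
	struct timeout t = { .armed = 1 };
	struct timeout *res = extract_timeout(&t, 1);

	if (IS_ERR(res))
		printf("error: %ld\n", PTR_ERR(res));
	else
		printf("extracted, now disarmed: armed=%d\n", res->armed);
	return 0;
}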
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f47de27e5125..3930b11dcd58 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5633,24 +5633,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static int __io_timeout_cancel(struct io_kiocb *req)
-{
-	struct io_timeout_data *io = req->async_data;
-	int ret;
-
-	ret = hrtimer_try_to_cancel(&io->timer);
-	if (ret == -1)
-		return -EALREADY;
-	list_del_init(&req->timeout.list);
-
-	req_set_fail_links(req);
-	io_cqring_fill_event(req, -ECANCELED);
-	io_put_req_deferred(req, 1);
-	return 0;
-}
-
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
+					   __u64 user_data)
 {
+	struct io_timeout_data *io;
 	struct io_kiocb *req;
 	int ret = -ENOENT;
 
@@ -5662,9 +5648,27 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	}
 
 	if (ret == -ENOENT)
-		return ret;
+		return ERR_PTR(ret);
 
-	return __io_timeout_cancel(req);
+	io = req->async_data;
+	ret = hrtimer_try_to_cancel(&io->timer);
+	if (ret == -1)
+		return ERR_PTR(-EALREADY);
+	list_del_init(&req->timeout.list);
+	return req;
+}
+
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req_set_fail_links(req);
+	io_cqring_fill_event(req, -ECANCELED);
+	io_put_req_deferred(req, 1);
+	return 0;
 }
 
 static int io_timeout_remove_prep(struct io_kiocb *req,

From 9c8e11b36c9b640a85a4a33a9e9dff418993cc34 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 30 Nov 2020 19:11:16 +0000
Subject: [PATCH 39/43] io_uring: add timeout update

Support timeout updates through IORING_OP_TIMEOUT_REMOVE with the
IORING_TIMEOUT_UPDATE flag passed in. Updates don't support the offset
timeout mode; the original timeout.off will be ignored as well.
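A minimal userspace sketch of how the new flag is driven, mirroring
what io_timeout_remove_prep() below reads from the SQE. This assumes
the raw SQE layout; prep_timeout_update() is a hypothetical helper,
not something this series adds:

	#include <string.h>
	#include <linux/io_uring.h>
	#include <linux/time_types.h>

	/*
	 * Re-arm an armed timeout identified by its user_data. The kernel
	 * reads the new expiration from sqe->addr2 via get_timespec64().
	 * 'flags' is 0 for a relative timeout or IORING_TIMEOUT_ABS for an
	 * absolute one.
	 */
	static void prep_timeout_update(struct io_uring_sqe *sqe,
					__u64 timeout_user_data,
					struct __kernel_timespec *ts,
					unsigned int flags)
	{
		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
		sqe->fd = -1;
		sqe->addr = timeout_user_data;	/* which timeout to update */
		sqe->addr2 = (unsigned long)ts;	/* new expiration */
		sqe->timeout_flags = IORING_TIMEOUT_UPDATE | flags;
	}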
Signed-off-by: Pavel Begunkov [axboe: remove now unused 'ret' variable] Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 ++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3930b11dcd58..b40083cde733 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -453,6 +453,10 @@ struct io_timeout { struct io_timeout_rem { struct file *file; u64 addr; + + /* timeout update */ + struct timespec64 ts; + u32 flags; }; struct io_rw { @@ -867,7 +871,10 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_timeout_data), .work_flags = IO_WQ_WORK_MM, }, - [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .work_flags = IO_WQ_WORK_MM, + }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -5671,17 +5678,48 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) +{ + struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_timeout_data *data; + + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout.off = 0; /* noseq */ + data = req->async_data; + list_add_tail(&req->timeout.list, &ctx->timeout_list); + hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + static int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_timeout_rem *tr = &req->timeout_rem; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) + if (sqe->ioprio || sqe->buf_index || sqe->len) return -EINVAL; - req->timeout_rem.addr = READ_ONCE(sqe->addr); + tr->addr = READ_ONCE(sqe->addr); + tr->flags = READ_ONCE(sqe->timeout_flags); + if (tr->flags & IORING_TIMEOUT_UPDATE) { + if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + return -EINVAL; + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + return -EFAULT; + } else if (tr->flags) { + /* timeout removal doesn't support flags */ + return -EINVAL; + } + return 0; } @@ -5690,11 +5728,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req, */ static int io_timeout_remove(struct io_kiocb *req) { + struct io_timeout_rem *tr = &req->timeout_rem; struct io_ring_ctx *ctx = req->ctx; int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, req->timeout_rem.addr); + if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { + enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) + ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + } else { + ret = io_timeout_cancel(ctx, tr->addr); + } io_cqring_fill_event(req, ret); io_commit_cqring(ctx); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6bb8229de892..d31a2a1e8ef9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -151,6 +151,7 @@ enum { * sqe->timeout_flags */ #define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) /* * sqe->splice_flags From dad1b1242fd5717af18ae4ac9d12b9f65849e13a Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Sun, 6 Dec 2020 22:22:42 +0000 Subject: [PATCH 40/43] io_uring: always let io_iopoll_complete() complete polled io Abaci Fuzz reported a double-free or invalid-free BUG in io_commit_cqring(): [ 95.504842] BUG: KASAN: double-free or invalid-free in io_commit_cqring+0x3ec/0x8e0 [ 95.505921] [ 95.506225] CPU: 0 PID: 4037 Comm: io_wqe_worker-0 Tainted: G B W 5.10.0-rc5+ #1 [ 95.507434] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 95.508248] Call Trace: [ 95.508683] dump_stack+0x107/0x163 [ 95.509323] ? io_commit_cqring+0x3ec/0x8e0 [ 95.509982] print_address_description.constprop.0+0x3e/0x60 [ 95.510814] ? vprintk_func+0x98/0x140 [ 95.511399] ? io_commit_cqring+0x3ec/0x8e0 [ 95.512036] ? io_commit_cqring+0x3ec/0x8e0 [ 95.512733] kasan_report_invalid_free+0x51/0x80 [ 95.513431] ? io_commit_cqring+0x3ec/0x8e0 [ 95.514047] __kasan_slab_free+0x141/0x160 [ 95.514699] kfree+0xd1/0x390 [ 95.515182] io_commit_cqring+0x3ec/0x8e0 [ 95.515799] __io_req_complete.part.0+0x64/0x90 [ 95.516483] io_wq_submit_work+0x1fa/0x260 [ 95.517117] io_worker_handle_work+0xeac/0x1c00 [ 95.517828] io_wqe_worker+0xc94/0x11a0 [ 95.518438] ? io_worker_handle_work+0x1c00/0x1c00 [ 95.519151] ? __kthread_parkme+0x11d/0x1d0 [ 95.519806] ? io_worker_handle_work+0x1c00/0x1c00 [ 95.520512] ? io_worker_handle_work+0x1c00/0x1c00 [ 95.521211] kthread+0x396/0x470 [ 95.521727] ? _raw_spin_unlock_irq+0x24/0x30 [ 95.522380] ? 
kthread_mod_delayed_work+0x180/0x180
[ 95.523108] ret_from_fork+0x22/0x30
[ 95.523684]
[ 95.523985] Allocated by task 4035:
[ 95.524543] kasan_save_stack+0x1b/0x40
[ 95.525136] __kasan_kmalloc.constprop.0+0xc2/0xd0
[ 95.525882] kmem_cache_alloc_trace+0x17b/0x310
[ 95.533930] io_queue_sqe+0x225/0xcb0
[ 95.534505] io_submit_sqes+0x1768/0x25f0
[ 95.535164] __x64_sys_io_uring_enter+0x89e/0xd10
[ 95.535900] do_syscall_64+0x33/0x40
[ 95.536465] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 95.537199]
[ 95.537505] Freed by task 4035:
[ 95.538003] kasan_save_stack+0x1b/0x40
[ 95.538599] kasan_set_track+0x1c/0x30
[ 95.539177] kasan_set_free_info+0x1b/0x30
[ 95.539798] __kasan_slab_free+0x112/0x160
[ 95.540427] kfree+0xd1/0x390
[ 95.540910] io_commit_cqring+0x3ec/0x8e0
[ 95.541516] io_iopoll_complete+0x914/0x1390
[ 95.542150] io_do_iopoll+0x580/0x700
[ 95.542724] io_iopoll_try_reap_events.part.0+0x108/0x200
[ 95.543512] io_ring_ctx_wait_and_kill+0x118/0x340
[ 95.544206] io_uring_release+0x43/0x50
[ 95.544791] __fput+0x28d/0x940
[ 95.545291] task_work_run+0xea/0x1b0
[ 95.545873] do_exit+0xb6a/0x2c60
[ 95.546400] do_group_exit+0x12a/0x320
[ 95.546967] __x64_sys_exit_group+0x3f/0x50
[ 95.547605] do_syscall_64+0x33/0x40
[ 95.548155] entry_SYSCALL_64_after_hwframe+0x44/0xa9

The reason is that once we get a non-EAGAIN error in
io_wq_submit_work(), we complete the request by calling
io_req_complete(), which holds completion_lock while calling
io_commit_cqring(). But for polled io, io_iopoll_complete() doesn't
hold completion_lock when calling io_commit_cqring(), so there may be
concurrent access to ctx->defer_list and a double free may happen.

To fix this bug, always let io_iopoll_complete() complete polled io.

Cc: # 5.5+
Reported-by: Abaci Fuzz
Signed-off-by: Xiaoguang Wang
Reviewed-by: Pavel Begunkov
Reviewed-by: Joseph Qi
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b40083cde733..89fd893a6952 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6341,8 +6341,19 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 	}
 
 	if (ret) {
-		req_set_fail_links(req);
-		io_req_complete(req, ret);
+		/*
+		 * io_iopoll_complete() does not hold completion_lock to complete
+		 * polled io, so here for polled io, just mark it done and still let
+		 * io_iopoll_complete() complete it.
+		 */
+		if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+			struct kiocb *kiocb = &req->rw.kiocb;
+
+			kiocb_done(kiocb, ret, NULL);
+		} else {
+			req_set_fail_links(req);
+			io_req_complete(req, ret);
+		}
 	}
 
 	return io_steal_work(req);

From 31bff9a51b264df6d144931a6a5f1d6cc815ed4b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 6 Dec 2020 22:22:43 +0000
Subject: [PATCH 41/43] io_uring: fix racy IOPOLL completions

IOPOLL allows buffer remove/provide requests, but they don't
synchronise by the rules of IOPOLL, namely they have to hold
uring_lock.
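For background on the rules of IOPOLL: completions on an
IORING_SETUP_IOPOLL ring are reaped when userspace calls
io_uring_enter() with IORING_ENTER_GETEVENTS, and the kernel polls and
completes requests there under ctx->uring_lock, so any other in-kernel
completion path must take the same lock. A rough userspace-side sketch,
assuming a raw syscall wrapper; reap_iopoll_events() is our name, not a
kernel or liburing symbol:

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	/*
	 * Poll an IORING_SETUP_IOPOLL ring for completions. The in-kernel
	 * polling/completion side of this call runs with uring_lock held,
	 * which is the synchronisation this patch restores for the buffer
	 * remove/provide paths.
	 */
	static int reap_iopoll_events(int ring_fd, unsigned int min_complete)
	{
		return syscall(__NR_io_uring_enter, ring_fd, 0 /* to_submit */,
			       min_complete, IORING_ENTER_GETEVENTS, NULL, 0);
	}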
Cc: # 5.7+
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 89fd893a6952..d030a4404b8f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4152,11 +4152,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
 	head = idr_find(&ctx->io_buffer_idr, p->bgid);
 	if (head)
 		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
-	io_ring_submit_lock(ctx, !force_nonblock);
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+
+	/* need to hold the lock to complete IOPOLL requests */
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		__io_req_complete(req, ret, 0, cs);
+		io_ring_submit_unlock(ctx, !force_nonblock);
+	} else {
+		io_ring_submit_unlock(ctx, !force_nonblock);
+		__io_req_complete(req, ret, 0, cs);
+	}
 	return 0;
 }
 
@@ -4241,10 +4247,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
 		}
 	}
 out:
-	io_ring_submit_unlock(ctx, !force_nonblock);
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+
+	/* need to hold the lock to complete IOPOLL requests */
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		__io_req_complete(req, ret, 0, cs);
+		io_ring_submit_unlock(ctx, !force_nonblock);
+	} else {
+		io_ring_submit_unlock(ctx, !force_nonblock);
+		__io_req_complete(req, ret, 0, cs);
+	}
 	return 0;
 }

From 634578f800652035debba3098d8ab0d21af7c7a5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 6 Dec 2020 22:22:44 +0000
Subject: [PATCH 42/43] io_uring: fix racy IOPOLL flush overflow

It's not safe to call io_cqring_overflow_flush() for IOPOLL mode
without holding uring_lock, because it does synchronisation
differently. Make sure we have it.

As for io_ring_exit_work(), we don't even need it there because
io_ring_ctx_wait_and_kill() already sets the force flag, making all
overflowed requests be dropped.

Cc: # 5.5+
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d030a4404b8f..c0306f77211a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8654,8 +8654,6 @@ static void io_ring_exit_work(struct work_struct *work)
 	 * as nobody else will be looking for them.
 	 */
 	do {
-		if (ctx->rings)
-			io_cqring_overflow_flush(ctx, true, NULL, NULL);
 		io_iopoll_try_reap_events(ctx);
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
@@ -8665,6 +8663,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
 	percpu_ref_kill(&ctx->refs);
+	if (ctx->rings)
+		io_cqring_overflow_flush(ctx, true, NULL, NULL);
 	mutex_unlock(&ctx->uring_lock);
 
 	io_kill_timeouts(ctx, NULL, NULL);
@@ -8674,8 +8674,6 @@
 		io_wq_cancel_all(ctx->io_wq);
 
 	/* if we failed setting up the ctx, we might not have any rings */
-	if (ctx->rings)
-		io_cqring_overflow_flush(ctx, true, NULL, NULL);
 	io_iopoll_try_reap_events(ctx);
 	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
@@ -8840,7 +8838,9 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 	}
 
 	io_cancel_defer_files(ctx, task, files);
+	io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
 	io_cqring_overflow_flush(ctx, true, task, files);
+	io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
 
 	if (!files)
 		__io_uring_cancel_task_requests(ctx, task);
@@ -9172,8 +9172,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	 */
 	ret = 0;
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
 		if (!list_empty_careful(&ctx->cq_overflow_list))
 			io_cqring_overflow_flush(ctx, false, NULL, NULL);
+		io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
 		if (flags & IORING_ENTER_SQ_WAKEUP)
 			wake_up(&ctx->sq_data->wait);
 		if (flags & IORING_ENTER_SQ_WAIT)

From 59850d226e4907a6f37c1d2fe5ba97546a8691a4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 6 Dec 2020 22:22:45 +0000
Subject: [PATCH 43/43] io_uring: fix io_cqring_events()'s noflush

Checking !list_empty(&ctx->cq_overflow_list) around noflush in
io_cqring_events() is racy: if it fails but a request overflows just
after that, io_cqring_overflow_flush() will still be called. Remove
the second check; it shouldn't be a problem for performance, because
there is a cq_check_overflow bit check just above.

Cc: # 5.5+
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c0306f77211a..f53356ced5ab 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2329,7 +2329,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 	 * we wake up the task, and the next invocation will flush the
 	 * entries. We cannot safely to it from here.
 	 */
-	if (noflush && !list_empty(&ctx->cq_overflow_list))
+	if (noflush)
 		return -1U;
 
 	io_cqring_overflow_flush(ctx, false, NULL, NULL);
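A closing usage sketch for the overflow-flush fixes above, with the
caveat that the helper is illustrative and not part of this series:
when CQEs overflow, the kernel sets IORING_SQ_CQ_OVERFLOW in the
mmap'ed SQ ring flags, and an application that sees it can enter the
kernel with IORING_ENTER_GETEVENTS so io_cqring_overflow_flush() runs,
now under the proper locking:

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	/*
	 * 'sq_flags' points at the mmap'ed SQ ring flags word. If the kernel
	 * flagged a CQ overflow, enter with GETEVENTS so the overflowed CQEs
	 * are flushed back into the CQ ring.
	 */
	static void flush_cq_overflow(int ring_fd, const unsigned int *sq_flags)
	{
		if (*(volatile const unsigned int *)sq_flags & IORING_SQ_CQ_OVERFLOW)
			syscall(__NR_io_uring_enter, ring_fd, 0, 0,
				IORING_ENTER_GETEVENTS, NULL, 0);
	}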