diff --git a/fs/io-wq.c b/fs/io-wq.c index 5cef075c0b37..cc5cf2209fb0 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -69,6 +69,8 @@ struct io_worker { #define IO_WQ_HASH_ORDER 5 #endif +#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) + struct io_wqe_acct { unsigned nr_workers; unsigned max_workers; @@ -98,6 +100,7 @@ struct io_wqe { struct list_head all_list; struct io_wq *wq; + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; }; /* @@ -107,8 +110,7 @@ struct io_wq { struct io_wqe **wqes; unsigned long state; - get_work_fn *get_work; - put_work_fn *put_work; + free_work_fn *free_work; struct task_struct *manager; struct user_struct *user; @@ -376,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) return __io_worker_unuse(wqe, worker); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) +static inline unsigned int io_get_work_hash(struct io_wq_work *work) +{ + return work->flags >> IO_WQ_HASH_SHIFT; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; - struct io_wq_work *work; + struct io_wq_work *work, *tail; + unsigned int hash; wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ - if (!(work->flags & IO_WQ_WORK_HASHED)) { - wq_node_del(&wqe->work_list, node, prev); + if (!io_wq_is_hashed(work)) { + wq_list_del(&wqe->work_list, node, prev); return work; } /* hashed, can run if not already running */ - *hash = work->flags >> IO_WQ_HASH_SHIFT; - if (!(wqe->hash_map & BIT_ULL(*hash))) { - wqe->hash_map |= BIT_ULL(*hash); - wq_node_del(&wqe->work_list, node, prev); + hash = io_get_work_hash(work); + if (!(wqe->hash_map & BIT(hash))) { + wqe->hash_map |= BIT(hash); + /* all items with this hash lie in [work, tail] */ + tail = wqe->hash_tail[hash]; + wqe->hash_tail[hash] = NULL; + wq_list_cut(&wqe->work_list, &tail->list, prev); return work; } } @@ -440,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker, worker->saved_creds = old_creds; } +static void io_impersonate_work(struct io_worker *worker, + struct io_wq_work *work) +{ + if (work->files && current->files != work->files) { + task_lock(current); + current->files = work->files; + task_unlock(current); + } + if (work->fs && current->fs != work->fs) + current->fs = work->fs; + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); +} + +static void io_assign_current_work(struct io_worker *worker, + struct io_wq_work *work) +{ + if (work) { + /* flush pending signals before assigning new work */ + if (signal_pending(current)) + flush_signals(current); + cond_resched(); + } + + spin_lock_irq(&worker->lock); + worker->cur_work = work; + spin_unlock_irq(&worker->lock); +} + +static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { - struct io_wq_work *work, *old_work = NULL, *put_work = NULL; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; do { - unsigned hash = -1U; - + struct io_wq_work *work; + unsigned int hash; +get_next: /* * If we got some work, mark us as busy. If we didn't, but * the list isn't empty, it means we stalled on hashed work. @@ -457,81 +501,60 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(wqe, &hash); + work = io_get_next_work(wqe); if (work) __io_worker_busy(wqe, worker, work); else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; spin_unlock_irq(&wqe->lock); - if (put_work && wq->put_work) - wq->put_work(old_work); if (!work) break; -next: - /* flush any pending signals before assigning new work */ - if (signal_pending(current)) - flush_signals(current); + io_assign_current_work(worker, work); - cond_resched(); + /* handle a whole dependent link */ + do { + struct io_wq_work *old_work, *next_hashed, *linked; - spin_lock_irq(&worker->lock); - worker->cur_work = work; - spin_unlock_irq(&worker->lock); + next_hashed = wq_next_work(work); + io_impersonate_work(worker, work); + /* + * OK to set IO_WQ_WORK_CANCEL even for uncancellable + * work, the worker function will do the right thing. + */ + if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) + work->flags |= IO_WQ_WORK_CANCEL; - if (work->flags & IO_WQ_WORK_CB) - work->func(&work); + hash = io_get_work_hash(work); + linked = old_work = work; + linked->func(&linked); + linked = (old_work == linked) ? NULL : linked; - if (work->files && current->files != work->files) { - task_lock(current); - current->files = work->files; - task_unlock(current); - } - if (work->fs && current->fs != work->fs) - current->fs = work->fs; - if (work->mm != worker->mm) - io_wq_switch_mm(worker, work); - if (worker->cur_creds != work->creds) - io_wq_switch_creds(worker, work); - /* - * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, - * the worker function will do the right thing. - */ - if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) - work->flags |= IO_WQ_WORK_CANCEL; - if (worker->mm) - work->flags |= IO_WQ_WORK_HAS_MM; + work = next_hashed; + if (!work && linked && !io_wq_is_hashed(linked)) { + work = linked; + linked = NULL; + } + io_assign_current_work(worker, work); + wq->free_work(old_work); - if (wq->get_work) { - put_work = work; - wq->get_work(work); - } + if (linked) + io_wqe_enqueue(wqe, linked); - old_work = work; - work->func(&work); - - spin_lock_irq(&worker->lock); - worker->cur_work = NULL; - spin_unlock_irq(&worker->lock); + if (hash != -1U && !next_hashed) { + spin_lock_irq(&wqe->lock); + wqe->hash_map &= ~BIT_ULL(hash); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + /* dependent work is not hashed */ + hash = -1U; + /* skip unnecessary unlock-lock wqe->lock */ + if (!work) + goto get_next; + spin_unlock_irq(&wqe->lock); + } + } while (work); spin_lock_irq(&wqe->lock); - - if (hash != -1U) { - wqe->hash_map &= ~BIT_ULL(hash); - wqe->flags &= ~IO_WQE_FLAG_STALLED; - } - if (work && work != old_work) { - spin_unlock_irq(&wqe->lock); - - if (put_work && wq->put_work) { - wq->put_work(put_work); - put_work = NULL; - } - - /* dependent work not hashed */ - hash = -1U; - goto next; - } } while (1); } @@ -747,17 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, return true; } -static void io_run_cancel(struct io_wq_work *work) +static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) { + struct io_wq *wq = wqe->wq; + do { struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; work->func(&work); work = (work == old_work) ? NULL : work; + wq->free_work(old_work); } while (work); } +static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +{ + unsigned int hash; + struct io_wq_work *tail; + + if (!io_wq_is_hashed(work)) { +append: + wq_list_add_tail(&work->list, &wqe->work_list); + return; + } + + hash = io_get_work_hash(work); + tail = wqe->hash_tail[hash]; + wqe->hash_tail[hash] = work; + if (!tail) + goto append; + + wq_list_add_after(&work->list, &tail->list, &wqe->work_list); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -771,13 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * It's close enough to not be an issue, fork() has the same delay. */ if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - io_run_cancel(work); + io_run_cancel(work, wqe); return; } work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); - wq_list_add_tail(&work->list, &wqe->work_list); + io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); @@ -794,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } /* - * Enqueue work, hashed by some key. Work items that hash to the same value - * will not be done in parallel. Used to limit concurrent writes, generally - * hashed by inode. + * Work items that hash to the same value will not be done in parallel. + * Used to limit concurrent writes, generally hashed by inode. */ -void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val) +void io_wq_hash_work(struct io_wq_work *work, void *val) { - struct io_wqe *wqe = wq->wqes[numa_node_id()]; - unsigned bit; - + unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); - io_wqe_enqueue(wqe, work); } static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) @@ -856,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq) } struct io_cb_cancel_data { - struct io_wqe *wqe; - work_cancel_fn *cancel; - void *caller_data; + work_cancel_fn *fn; + void *data; }; -static bool io_work_cancel(struct io_worker *worker, void *cancel_data) +static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { - struct io_cb_cancel_data *data = cancel_data; + struct io_cb_cancel_data *match = data; unsigned long flags; bool ret = false; @@ -874,83 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && - data->cancel(worker->cur_work, data->caller_data)) { - send_sig(SIGINT, worker->task, 1); - ret = true; - } - spin_unlock_irqrestore(&worker->lock, flags); - - return ret; -} - -static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, - work_cancel_fn *cancel, - void *cancel_data) -{ - struct io_cb_cancel_data data = { - .wqe = wqe, - .cancel = cancel, - .caller_data = cancel_data, - }; - struct io_wq_work_node *node, *prev; - struct io_wq_work *work; - unsigned long flags; - bool found = false; - - spin_lock_irqsave(&wqe->lock, flags); - wq_list_for_each(node, prev, &wqe->work_list) { - work = container_of(node, struct io_wq_work, list); - - if (cancel(work, cancel_data)) { - wq_node_del(&wqe->work_list, node, prev); - found = true; - break; - } - } - spin_unlock_irqrestore(&wqe->lock, flags); - - if (found) { - io_run_cancel(work); - return IO_WQ_CANCEL_OK; - } - - rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_work_cancel, &data); - rcu_read_unlock(); - return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; -} - -enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data) -{ - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int node; - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - ret = io_wqe_cancel_cb_work(wqe, cancel, data); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; - } - - return ret; -} - -struct work_match { - bool (*fn)(struct io_wq_work *, void *data); - void *data; -}; - -static bool io_wq_worker_cancel(struct io_worker *worker, void *data) -{ - struct work_match *match = data; - unsigned long flags; - bool ret = false; - - spin_lock_irqsave(&worker->lock, flags); - if (match->fn(worker->cur_work, match->data) && - !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { + match->fn(worker->cur_work, match->data)) { send_sig(SIGINT, worker->task, 1); ret = true; } @@ -960,7 +925,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct work_match *match) + struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; @@ -977,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (match->fn(work, match->data)) { - wq_node_del(&wqe->work_list, node, prev); + wq_list_del(&wqe->work_list, node, prev); found = true; break; } @@ -985,7 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, spin_unlock_irqrestore(&wqe->lock, flags); if (found) { - io_run_cancel(work); + io_run_cancel(work, wqe); return IO_WQ_CANCEL_OK; } @@ -1001,60 +966,49 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } -static bool io_wq_work_match(struct io_wq_work *work, void *data) +enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, + void *data) +{ + struct io_cb_cancel_data match = { + .fn = cancel, + .data = data, + }; + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + ret = io_wqe_cancel_work(wqe, &match); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + +static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) { return work == data; } enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { - struct work_match match = { - .fn = io_wq_work_match, - .data = cwork - }; - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int node; - - cwork->flags |= IO_WQ_WORK_CANCEL; - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - ret = io_wqe_cancel_work(wqe, &match); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; - } - - return ret; + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); } static bool io_wq_pid_match(struct io_wq_work *work, void *data) { pid_t pid = (pid_t) (unsigned long) data; - if (work) - return work->task_pid == pid; - return false; + return work->task_pid == pid; } enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) { - struct work_match match = { - .fn = io_wq_pid_match, - .data = (void *) (unsigned long) pid - }; - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int node; + void *data = (void *) (unsigned long) pid; - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - ret = io_wqe_cancel_work(wqe, &match); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; - } - - return ret; + return io_wq_cancel_cb(wq, io_wq_pid_match, data); } struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) @@ -1062,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret = -ENOMEM, node; struct io_wq *wq; + if (WARN_ON_ONCE(!data->free_work)) + return ERR_PTR(-EINVAL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); @@ -1072,8 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) return ERR_PTR(-ENOMEM); } - wq->get_work = data->get_work; - wq->put_work = data->put_work; + wq->free_work = data->free_work; /* caller must already hold a reference to this */ wq->user = data->user; @@ -1130,7 +1086,7 @@ err: bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) { - if (data->get_work != wq->get_work || data->put_work != wq->put_work) + if (data->free_work != wq->free_work) return false; return refcount_inc_not_zero(&wq->use_refs); diff --git a/fs/io-wq.h b/fs/io-wq.h index e5e15f2c93ec..3ee7356d6be5 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -5,10 +5,8 @@ struct io_wq; enum { IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_CB = 128, IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_CONCURRENT = 512, @@ -30,6 +28,18 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +static inline void wq_list_add_after(struct io_wq_work_node *node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { @@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } -static inline void wq_node_del(struct io_wq_work_list *list, +static inline void wq_list_cut(struct io_wq_work_list *list, + struct io_wq_work_node *last, + struct io_wq_work_node *prev) +{ + /* first in the list, if prev==NULL */ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) + list->last = prev; + last->next = NULL; +} + +static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) { - if (node == list->first) - WRITE_ONCE(list->first, node->next); - if (node == list->last) - list->last = prev; - if (prev) - prev->next = node->next; - node->next = NULL; + wq_list_cut(list, node, prev); } #define wq_list_for_each(pos, prv, head) \ @@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, } while (0) struct io_wq_work { - union { - struct io_wq_work_node list; - void *data; - }; + struct io_wq_work_node list; void (*func)(struct io_wq_work **); struct files_struct *files; struct mm_struct *mm; @@ -83,14 +99,20 @@ struct io_wq_work { *(work) = (struct io_wq_work){ .func = _func }; \ } while (0) \ -typedef void (get_work_fn)(struct io_wq_work *); -typedef void (put_work_fn)(struct io_wq_work *); +static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) +{ + if (!work->list.next) + return NULL; + + return container_of(work->list.next, struct io_wq_work, list); +} + +typedef void (free_work_fn)(struct io_wq_work *); struct io_wq_data { struct user_struct *user; - get_work_fn *get_work; - put_work_fn *put_work; + free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); @@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); -void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); +void io_wq_hash_work(struct io_wq_work *work, void *val); + +static inline bool io_wq_is_hashed(struct io_wq_work *work) +{ + return work->flags & IO_WQ_WORK_HASHED; +} void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); diff --git a/fs/io_uring.c b/fs/io_uring.c index 3affd96a98ba..358f97be9c7b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -193,6 +196,13 @@ struct fixed_file_data { struct completion done; }; +struct io_buffer { + struct list_head list; + __u64 addr; + __s32 len; + __u16 bid; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -270,6 +280,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr io_buffer_idr; + struct idr personality_idr; struct { @@ -290,7 +302,6 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - struct llist_head poll_llist; /* * ->poll_list is protected by the ctx->uring_lock for @@ -386,7 +397,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; + int bgid; size_t len; + struct io_buffer *kbuf; }; struct io_open { @@ -430,6 +443,24 @@ struct io_epoll { struct epoll_event event; }; +struct io_splice { + struct file *file_out; + struct file *file_in; + loff_t off_out; + loff_t off_in; + u64 len; + unsigned int flags; +}; + +struct io_provide_buf { + struct file *file; + __u64 addr; + __s32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -464,6 +495,7 @@ enum { REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, @@ -479,6 +511,11 @@ enum { REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, }; enum { @@ -492,6 +529,8 @@ enum { REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), /* already grabbed next link */ REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), @@ -521,6 +560,15 @@ enum { REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* in overflow list */ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), +}; + +struct async_poll { + struct io_poll_iocb poll; + struct io_wq_work work; }; /* @@ -546,32 +594,45 @@ struct io_kiocb { struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; + struct io_splice splice; + struct io_provide_buf pbuf; }; struct io_async_ctx *io; - /* - * llist_node is only used for poll deferred completions - */ - struct llist_node llist_node; - bool in_async; bool needs_fixed_file; u8 opcode; struct io_ring_ctx *ctx; - union { - struct list_head list; - struct hlist_node hash_node; - }; - struct list_head link_list; + struct list_head list; unsigned int flags; refcount_t refs; + union { + struct task_struct *task; + unsigned long fsize; + }; u64 user_data; u32 result; u32 sequence; + struct list_head link_list; + struct list_head inflight_entry; - struct io_wq_work work; + union { + /* + * Only commands that never go async can use the below fields, + * obviously. Right now only IORING_OP_POLL_ADD uses them, and + * async armed poll handlers for regular commands. The latter + * restore the work, if needed. + */ + struct { + struct callback_head task_work; + struct hlist_node hash_node; + struct async_poll *apoll; + int cflags; + }; + struct io_wq_work work; + }; }; #define IO_PLUG_THRESHOLD 2 @@ -615,6 +676,11 @@ struct io_op_def { unsigned file_table : 1; /* needs ->fs */ unsigned needs_fs : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; }; static const struct io_op_def io_op_defs[] = { @@ -624,6 +690,8 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITEV] = { .async_ctx = 1, @@ -631,6 +699,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -638,11 +707,13 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -658,6 +729,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollout = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, @@ -665,6 +737,8 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -676,6 +750,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .file_table = 1, + .pollin = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -687,6 +762,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -715,11 +791,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -731,11 +810,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_OPENAT2] = { .needs_file = 1, @@ -747,6 +829,13 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .file_table = 1, }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_PROVIDE_BUFFERS] = {}, + [IORING_OP_REMOVE_BUFFERS] = {}, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -761,6 +850,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, + const struct io_uring_sqe *sqe); static struct kmem_cache *req_cachep; @@ -827,11 +920,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->io_buffer_idr); idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -952,15 +1045,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } -static inline bool io_prep_async_work(struct io_kiocb *req, +static inline void io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { const struct io_op_def *def = &io_op_defs[req->opcode]; - bool do_hashed = false; if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file) - do_hashed = true; + io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; @@ -969,25 +1061,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req, io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); - return do_hashed; } static inline void io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; - bool do_hashed; - do_hashed = io_prep_async_work(req, &link); + io_prep_async_work(req, &link); - trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, - req->flags); - if (!do_hashed) { - io_wq_enqueue(ctx->io_wq, &req->work); - } else { - io_wq_enqueue_hashed(ctx->io_wq, &req->work, - file_inode(req->file)); - } + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); + io_wq_enqueue(ctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1054,24 +1139,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return false; if (!ctx->eventfd_async) return true; - return io_wq_current_is_worker() || in_interrupt(); + return io_wq_current_is_worker(); } -static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (trigger_ev) + if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1108,7 +1188,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, req->cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1132,7 +1212,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return cqe != NULL; } -static void io_cqring_fill_event(struct io_kiocb *req, long res) +static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; @@ -1148,7 +1228,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) if (likely(cqe)) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, cflags); } else if (ctx->cq_overflow_flushed) { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1160,23 +1240,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &ctx->cq_overflow_list); } } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void io_cqring_fill_event(struct io_kiocb *req, long res) +{ + __io_cqring_fill_event(req, res, 0); +} + +static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(req, res); + __io_cqring_fill_event(req, res, cflags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } +static void io_cqring_add_event(struct io_kiocb *req, long res) +{ + __io_cqring_add_event(req, res, 0); +} + static inline bool io_is_fallback_req(struct io_kiocb *req) { return req == (struct io_kiocb *) @@ -1246,6 +1337,15 @@ fallback: return NULL; } +static inline void io_put_file(struct io_kiocb *req, struct file *file, + bool fixed) +{ + if (fixed) + percpu_ref_put(&req->ctx->file_data->refs); + else + fput(file); +} + static void __io_req_do_free(struct io_kiocb *req) { if (likely(!io_is_fallback_req(req))) @@ -1256,18 +1356,12 @@ static void __io_req_do_free(struct io_kiocb *req) static void __io_req_aux_free(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); kfree(req->io); - if (req->file) { - if (req->flags & REQ_F_FIXED_FILE) - percpu_ref_put(&ctx->file_data->refs); - else - fput(req->file); - } + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); io_req_work_drop_env(req); } @@ -1474,6 +1568,30 @@ static void io_free_req(struct io_kiocb *req) io_queue_async_work(nxt); } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *link; + + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + io_queue_linked_timeout(link); + io_wq_submit_work(workptr); +} + +static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +{ + struct io_kiocb *link; + const struct io_op_def *def = &io_op_defs[nxt->opcode]; + + if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); + + *workptr = &nxt->work; + link = io_prep_linked_timeout(nxt); + if (link) + nxt->work.func = io_link_work_cb; +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1493,6 +1611,26 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static void io_steal_work(struct io_kiocb *req, + struct io_wq_work **workptr) +{ + /* + * It's in an io-wq worker, so there always should be at least + * one reference, which will be dropped in io_put_work() just + * after the current handler returns. + * + * It also means, that if the counter dropped to 1, then there is + * no asynchronous users left, so it's safe to steal the next work. + */ + if (refcount_read(&req->refs) == 1) { + struct io_kiocb *nxt = NULL; + + io_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); + } +} + /* * Must only be used if we don't need to care about links, usually from * within the completion handling itself. @@ -1554,6 +1692,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) return true; } +static int io_put_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + int cflags; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->rw.addr = 0; + kfree(kbuf); + return cflags; +} + /* * Find and free completed poll iocbs */ @@ -1565,10 +1716,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { + int cflags = 0; + req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + + __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; if (refcount_dec_and_test(&req->refs) && @@ -1577,6 +1733,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } io_commit_cqring(ctx); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); } @@ -1743,13 +1901,16 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + int cflags = 0; if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) req_set_fail_links(req); - io_cqring_add_event(req, res); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + __io_cqring_add_event(req, res, cflags); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -1760,17 +1921,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_put_req(req); } -static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - struct io_kiocb *nxt = NULL; - - io_complete_rw_common(kiocb, res); - io_put_req_find_next(req, &nxt); - - return nxt; -} - static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -1841,7 +1991,7 @@ static void io_file_put(struct io_submit_state *state) * assuming most submissions are for one file, or at least that each file * has more than one submission. */ -static struct file *io_file_get(struct io_submit_state *state, int fd) +static struct file *__io_file_get(struct io_submit_state *state, int fd) { if (!state) return fget(fd); @@ -1938,7 +2088,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index */ + /* we own ->private, reuse it for the buffer index / buffer ID */ req->rw.kiocb.private = (void *) (unsigned long) READ_ONCE(sqe->buf_index); return 0; @@ -1965,15 +2115,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, - bool in_async) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) - *nxt = __io_complete_rw(kiocb, ret); + if (ret >= 0 && kiocb->ki_complete == io_complete_rw) + io_complete_rw(kiocb, ret, 0); else io_rw_done(kiocb, ret); } @@ -2052,11 +2201,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, return len; } +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +{ + if (needs_lock) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (needs_lock) + mutex_lock(&ctx->uring_lock); +} + +static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, + int bgid, struct io_buffer *kbuf, + bool needs_lock) +{ + struct io_buffer *head; + + if (req->flags & REQ_F_BUFFER_SELECTED) + return kbuf; + + io_ring_submit_lock(req->ctx, needs_lock); + + lockdep_assert_held(&req->ctx->uring_lock); + + head = idr_find(&req->ctx->io_buffer_idr, bgid); + if (head) { + if (!list_empty(&head->list)) { + kbuf = list_last_entry(&head->list, struct io_buffer, + list); + list_del(&kbuf->list); + } else { + kbuf = head; + idr_remove(&req->ctx->io_buffer_idr, bgid); + } + if (*len > kbuf->len) + *len = kbuf->len; + } else { + kbuf = ERR_PTR(-ENOBUFS); + } + + io_ring_submit_unlock(req->ctx, needs_lock); + + return kbuf; +} + +static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, + bool needs_lock) +{ + struct io_buffer *kbuf; + int bgid; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + bgid = (int) (unsigned long) req->rw.kiocb.private; + kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + req->rw.addr = (u64) (unsigned long) kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + return u64_to_user_ptr(kbuf->addr); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + ssize_t len; + + uiov = u64_to_user_ptr(req->rw.addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) + return 0; + if (!req->rw.len) + return 0; + else if (req->rw.len > 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, needs_lock); +#endif + + return __io_iov_buffer_select(req, iov, needs_lock); +} + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter) + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; + ssize_t ret; u8 opcode; opcode = req->opcode; @@ -2065,12 +2350,20 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req, rw, iter); } - /* buffer index only valid with fixed read/write */ - if (req->rw.kiocb.private) + /* buffer index only valid with fixed read/write, or buffer select */ + if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - ssize_t ret; + if (req->flags & REQ_F_BUFFER_SELECT) { + buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + if (IS_ERR(buf)) { + *iovec = NULL; + return PTR_ERR(buf); + } + req->rw.len = sqe_len; + } + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; return ret < 0 ? ret : sqe_len; @@ -2086,6 +2379,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return iorw->size; } + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, *iovec, needs_lock); + if (!ret) { + ret = (*iovec)->iov_len; + iov_iter_init(iter, rw, *iovec, 1, ret); + } + *iovec = NULL; + return ret; + } + #ifdef CONFIG_COMPAT if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, @@ -2169,12 +2472,18 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, } } +static inline int __io_alloc_async_ctx(struct io_kiocb *req) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + return req->io == NULL; +} + static int io_alloc_async_ctx(struct io_kiocb *req) { if (!io_op_defs[req->opcode].async_ctx) return 0; - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - return req->io == NULL; + + return __io_alloc_async_ctx(req); } static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, @@ -2184,7 +2493,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) + if (__io_alloc_async_ctx(req)) return -ENOMEM; io_req_map_rw(req, io_size, iovec, fast_iov, iter); @@ -2213,7 +2522,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2222,8 +2531,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2231,13 +2539,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - ret = io_import_iovec(READ, req, &iovec, &iter); + ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; req->result = 0; io_size = ret; @@ -2248,10 +2556,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); @@ -2265,13 +2571,16 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + if (!(req->flags & REQ_F_NOWAIT)) + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2295,6 +2604,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; + req->fsize = rlimit(RLIMIT_FSIZE); + /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2302,7 +2613,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2311,8 +2622,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2320,7 +2630,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - ret = io_import_iovec(WRITE, req, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; @@ -2337,10 +2647,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } /* file path doesn't support NOWAIT for non-direct_IO */ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && @@ -2367,24 +2675,33 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, } kiocb->ki_flags |= IOCB_WRITE; + if (!force_nonblock) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + + if (!force_nonblock) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* - * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. */ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2394,6 +2711,76 @@ out_free: return ret; } +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + int ret; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + + sp->file_in = NULL; + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + + return 0; +} + +static bool io_splice_punt(struct file *file) +{ + if (get_pipe_info(file)) + return false; + if (!io_file_supports_async(file)) + return true; + return !(file->f_mode & O_NONBLOCK); +} + +static int io_splice(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + long ret; + + if (force_nonblock) { + if (io_splice_punt(in) || io_splice_punt(out)) + return -EAGAIN; + flags |= SPLICE_F_NONBLOCK; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req(req); + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. */ @@ -2442,85 +2829,63 @@ static bool io_req_cancelled(struct io_kiocb *req) return false; } -static void io_link_work_cb(struct io_wq_work **workptr) +static void __io_fsync(struct io_kiocb *req) { - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) -{ - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } -} - -static void io_fsync_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); loff_t end = req->sync.off + req->sync.len; - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + io_put_req(req); } -static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static void io_fsync_finish(struct io_wq_work **workptr) { - struct io_wq_work *work, *old_work; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + if (io_req_cancelled(req)) + return; + __io_fsync(req); + io_steal_work(req, workptr); +} + +static int io_fsync(struct io_kiocb *req, bool force_nonblock) +{ /* fsync always requires a blocking context */ if (force_nonblock) { - io_put_req(req); req->work.func = io_fsync_finish; return -EAGAIN; } - - work = old_work = &req->work; - io_fsync_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + __io_fsync(req); return 0; } +static void __io_fallocate(struct io_kiocb *req) +{ + int ret; + + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); +} + static void io_fallocate_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; - int ret; if (io_req_cancelled(req)) return; - - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + __io_fallocate(req); + io_steal_work(req, workptr); } static int io_fallocate_prep(struct io_kiocb *req, @@ -2532,26 +2897,19 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); + req->fsize = rlimit(RLIMIT_FSIZE); return 0; } -static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_fallocate(struct io_kiocb *req, bool force_nonblock) { - struct io_wq_work *work, *old_work; - /* fallocate always requiring blocking context */ if (force_nonblock) { - io_put_req(req); req->work.func = io_fallocate_finish; return -EAGAIN; } - work = old_work = &req->work; - io_fallocate_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - + __io_fallocate(req); return 0; } @@ -2626,8 +2984,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat2(struct io_kiocb *req, bool force_nonblock) { struct open_flags op; struct file *file; @@ -2658,15 +3015,171 @@ err: if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } -static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat(struct io_kiocb *req, bool force_nonblock) { req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); - return io_openat2(req, nxt, force_nonblock); + return io_openat2(req, force_nonblock); +} + +static int io_remove_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, + int bgid, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + /* the head kbuf is the list itself */ + while (!list_empty(&buf->list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&buf->list, struct io_buffer, list); + list_del(&nxt->list); + kfree(nxt); + if (++i == nbufs) + return i; + } + i++; + kfree(buf); + idr_remove(&ctx->io_buffer_idr, bgid); + + return i; +} + +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + ret = -ENOENT; + head = idr_find(&ctx->io_buffer_idr, p->bgid); + if (head) + ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + + io_ring_submit_lock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + +static int io_provide_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (!access_ok(u64_to_user_ptr(p->addr), p->len)) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + break; + + buf->addr = addr; + buf->len = pbuf->len; + buf->bid = bid; + addr += pbuf->len; + bid++; + if (!*head) { + INIT_LIST_HEAD(&buf->list); + *head = buf; + } else { + list_add_tail(&buf->list, &(*head)->list); + } + } + + return i ? i : -ENOMEM; +} + +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head, *list; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + + ret = io_add_buffers(p, &head); + if (ret < 0) + goto out; + + if (!list) { + ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, + GFP_KERNEL); + if (ret < 0) { + __io_remove_buffers(ctx, head, p->bgid, -1U); + goto out; + } + } +out: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; } static int io_epoll_ctl_prep(struct io_kiocb *req, @@ -2694,8 +3207,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -2708,7 +3220,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; #else return -EOPNOTSUPP; @@ -2730,8 +3242,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #endif } -static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_madvise(struct io_kiocb *req, bool force_nonblock) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = &req->madvise; @@ -2744,7 +3255,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; #else return -EOPNOTSUPP; @@ -2762,8 +3273,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_fadvise(struct io_kiocb *req, bool force_nonblock) { struct io_fadvise *fa = &req->fadvise; int ret; @@ -2783,7 +3293,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } @@ -2820,8 +3330,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_statx(struct io_kiocb *req, bool force_nonblock) { struct io_open *ctx = &req->open; unsigned lookup_flags; @@ -2858,7 +3367,7 @@ err: if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } @@ -2885,7 +3394,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } /* only called when __close_fd_get_file() is done */ -static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) +static void __io_close_finish(struct io_kiocb *req) { int ret; @@ -2894,22 +3403,19 @@ static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) req_set_fail_links(req); io_cqring_add_event(req, ret); fput(req->close.put_file); - io_put_req_find_next(req, nxt); + io_put_req(req); } static void io_close_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; /* not cancellable, don't do io_req_cancelled() */ - __io_close_finish(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); + __io_close_finish(req); + io_steal_work(req, workptr); } -static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock) { int ret; @@ -2919,23 +3425,25 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, return ret; /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) - goto eagain; + if (req->close.put_file->f_op->flush && force_nonblock) { + /* submission ref will be dropped, take it for async */ + refcount_inc(&req->refs); + + req->work.func = io_close_finish; + /* + * Do manual async queue here to avoid grabbing files - we don't + * need the files, and it'll cause io_close_finish() to close + * the file again and cause a double CQE entry for this request + */ + io_queue_async_work(req); + return 0; + } /* * No ->flush(), safely close from here and just punt the * fput() to async context. */ - __io_close_finish(req, nxt); - return 0; -eagain: - req->work.func = io_close_finish; - /* - * Do manual async queue here to avoid grabbing files - we don't - * need the files, and it'll cause io_close_finish() to close - * the file again and cause a double CQE entry for this request - */ - io_queue_async_work(req); + __io_close_finish(req); return 0; } @@ -2957,47 +3465,62 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_sync_file_range_finish(struct io_wq_work **workptr) +static void __io_sync_file_range(struct io_kiocb *req) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); + io_put_req(req); +} + + +static void io_sync_file_range_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_sync_file_range(req); + io_put_req(req); /* put submission ref */ if (nxt) io_wq_assign_next(workptr, nxt); } -static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) { - struct io_wq_work *work, *old_work; - /* sync_file_range always requires a blocking context */ if (force_nonblock) { - io_put_req(req); req->work.func = io_sync_file_range_finish; return -EAGAIN; } - work = old_work = &req->work; - io_sync_file_range_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + __io_sync_file_range(req); return 0; } +#if defined(CONFIG_NET) +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { + if (kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); + return -ENOMEM; + } + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + return -EAGAIN; +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; int ret; @@ -3023,15 +3546,10 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; -#else - return -EOPNOTSUPP; -#endif } -static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -3071,18 +3589,8 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, flags |= MSG_DONTWAIT; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - return -EAGAIN; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; } @@ -3093,17 +3601,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct socket *sock; int ret; @@ -3144,17 +3647,120 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; +} + +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct iovec __user *uiov; + size_t iov_len; + int ret; + + ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, + &uiov, &iov_len); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (iov_len > 1) + return -EINVAL; + if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = io->msg.iov[0].iov_len; + iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len); + io->msg.iov = NULL; + } else { + ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &io->msg.iov, &io->msg.msg.msg_iter); + if (ret > 0) + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_ctx *io) +{ + struct compat_msghdr __user *msg_compat; + struct io_sr_msg *sr = &req->sr_msg; + struct compat_iovec __user *uiov; + compat_uptr_t ptr; + compat_size_t len; + int ret; + + msg_compat = (struct compat_msghdr __user *) sr->msg; + ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + &ptr, &len); + if (ret) + return ret; + + uiov = compat_ptr(ptr); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (len > 1) + return -EINVAL; + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = io->msg.iov[0].iov_len; + io->msg.iov = NULL; + } else { + ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, + &io->msg.iov, + &io->msg.msg.msg_iter); + if (ret < 0) + return ret; + } + + return 0; +} #endif + +static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +{ + io->msg.iov = io->msg.fast_iov; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return __io_compat_recvmsg_copy_hdr(req, io); +#endif + + return __io_recvmsg_copy_hdr(req, io); +} + +static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, + int *cflags, bool needs_lock) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct io_buffer *kbuf; + + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return NULL; + + kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + + sr->kbuf = kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + + *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + *cflags |= IORING_CQE_F_BUFFER; + return kbuf; } static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; int ret; @@ -3162,6 +3768,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->bgid = READ_ONCE(sqe->buf_group); #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -3174,30 +3781,24 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.iov = io->msg.fast_iov; - ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, - &io->msg.uaddr, &io->msg.iov); + ret = io_recvmsg_copy_hdr(req, io); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; -#else - return -EOPNOTSUPP; -#endif } -static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct io_async_msghdr *kmsg = NULL; struct socket *sock; - int ret; + int ret, cflags = 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; sock = sock_from_file(req->file, &ret); if (sock) { + struct io_buffer *kbuf; struct io_async_ctx io; unsigned flags; @@ -3209,19 +3810,23 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - struct io_sr_msg *sr = &req->sr_msg; - kmsg = &io.msg; kmsg->msg.msg_name = &io.msg.addr; - io.msg.iov = io.msg.fast_iov; - ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, - sr->msg_flags, &io.msg.uaddr, - &io.msg.iov); + ret = io_recvmsg_copy_hdr(req, &io); if (ret) return ret; } + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) { + return PTR_ERR(kbuf); + } else if (kbuf) { + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } + flags = req->sr_msg.msg_flags; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -3230,18 +3835,8 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->flags |= REQ_F_NEED_CLEANUP; - return -EAGAIN; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; } @@ -3249,22 +3844,18 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); + __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) + struct io_buffer *kbuf = NULL; struct socket *sock; - int ret; + int ret, cflags = 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3272,15 +3863,25 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { struct io_sr_msg *sr = &req->sr_msg; + void __user *buf = sr->buf; struct msghdr msg; struct iovec iov; unsigned flags; - ret = import_single_range(READ, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + else if (kbuf) + buf = u64_to_user_ptr(kbuf->addr); + ret = import_single_range(READ, buf, sr->len, &iov, + &msg.msg_iter); + if (ret) { + kfree(kbuf); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -3301,20 +3902,17 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, ret = -EINTR; } - io_cqring_add_event(req, ret); + kfree(kbuf); + req->flags &= ~REQ_F_NEED_CLEANUP; + __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else - return -EOPNOTSUPP; -#endif } - static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_accept *accept = &req->accept; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -3327,14 +3925,9 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); return 0; -#else - return -EOPNOTSUPP; -#endif } -#if defined(CONFIG_NET) -static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int __io_accept(struct io_kiocb *req, bool force_nonblock) { struct io_accept *accept = &req->accept; unsigned file_flags; @@ -3351,44 +3944,34 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; } static void io_accept_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; if (io_req_cancelled(req)) return; - __io_accept(req, &nxt, false); - if (nxt) - io_wq_assign_next(workptr, nxt); + __io_accept(req, false); + io_steal_work(req, workptr); } -#endif -static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) int ret; - ret = __io_accept(req, nxt, force_nonblock); + ret = __io_accept(req, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - io_put_req(req); return -EAGAIN; } return 0; -#else - return -EOPNOTSUPP; -#endif } static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_NET) struct io_connect *conn = &req->connect; struct io_async_ctx *io = req->io; @@ -3405,15 +3988,10 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return move_addr_to_kernel(conn->addr, conn->addr_len, &io->connect.address); -#else - return -EOPNOTSUPP; -#endif } -static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock) { -#if defined(CONFIG_NET) struct io_async_ctx __io, *io; unsigned file_flags; int ret; @@ -3449,25 +4027,301 @@ out: if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req(req); return 0; -#else +} +#else /* !CONFIG_NET */ +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ return -EOPNOTSUPP; -#endif } -static void io_poll_remove_one(struct io_kiocb *req) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) { - struct io_poll_iocb *poll = &req->poll; + return -EOPNOTSUPP; +} + +static int io_send(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_recv(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_accept(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} + +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +static int io_connect(struct io_kiocb *req, bool force_nonblock) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_NET */ + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, + struct wait_queue_head *head) +{ + if (unlikely(poll->head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + poll->head = head; + add_wait_queue(head, &poll->wait); +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + __io_queue_proc(&pt->req->apoll->poll, pt, head); +} + +static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, + __poll_t mask, task_work_func_t func) +{ + struct task_struct *tsk; + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + + list_del_init(&poll->wait.entry); + + tsk = req->task; + req->result = mask; + init_task_work(&req->task_work, func); + /* + * If this fails, then the task is exiting. If that is the case, then + * the exit check will ultimately cancel these work items. Hence we + * don't need to check here and handle it specifically. + */ + task_work_add(tsk, &req->task_work, true); + wake_up_process(tsk); + return 1; +} + +static void io_async_task_func(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct async_poll *apoll = req->apoll; + struct io_ring_ctx *ctx = req->ctx; + + trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + + WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry)); + + if (hash_hashed(&req->hash_node)) { + spin_lock_irq(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock_irq(&ctx->completion_lock); + } + + /* restore ->work in case we need to retry again */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + + kfree(apoll); +} + +static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->apoll->poll; + + trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, + key_to_poll(key)); + + return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static __poll_t __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask, + wait_queue_func_t wake_func) + __acquires(&ctx->completion_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool cancel = false; + + poll->file = req->file; + poll->head = NULL; + poll->done = poll->canceled = false; + poll->events = mask; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = -EINVAL; + + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); + poll->wait.private = req; + + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt->error) + cancel = true; + ipt->error = 0; + mask = 0; + } + if (mask || ipt->error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + io_poll_req_insert(req); + spin_unlock(&poll->head->lock); + } + + return mask; +} + +static bool io_arm_poll_handler(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask, ret; + + if (!req->file || !file_can_poll(req->file)) + return false; + if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + return false; + if (!def->pollin && !def->pollout) + return false; + + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return false; + + req->flags |= REQ_F_POLLED; + memcpy(&apoll->work, &req->work, sizeof(req->work)); + + /* + * Don't need a reference here, as we're adding it to the task + * task_works list. If the task exits, the list is pruned. + */ + req->task = current; + req->apoll = apoll; + INIT_HLIST_NODE(&req->hash_node); + + mask = 0; + if (def->pollin) + mask |= POLLIN | POLLRDNORM; + if (def->pollout) + mask |= POLLOUT | POLLWRNORM; + mask |= POLLERR | POLLPRI; + + ipt.pt._qproc = io_async_queue_proc; + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, + io_async_wake); + if (ret) { + ipt.error = 0; + apoll->poll.done = true; + spin_unlock_irq(&ctx->completion_lock); + memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll); + return false; + } + spin_unlock_irq(&ctx->completion_lock); + trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, + apoll->poll.events); + return true; +} + +static bool __io_poll_remove_one(struct io_kiocb *req, + struct io_poll_iocb *poll) +{ + bool do_complete = false; spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - io_queue_async_work(req); + do_complete = true; } spin_unlock(&poll->head->lock); + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) +{ + bool do_complete; + + if (req->opcode == IORING_OP_POLL_ADD) { + do_complete = __io_poll_remove_one(req, &req->poll); + } else { + /* non-poll requests have submit ref still */ + do_complete = __io_poll_remove_one(req, &req->apoll->poll); + if (do_complete) + io_put_req(req); + } + hash_del(&req->hash_node); + + if (do_complete) { + io_cqring_fill_event(req, -ECANCELED); + io_commit_cqring(req->ctx); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } + + return do_complete; } static void io_poll_remove_all(struct io_ring_ctx *ctx) @@ -3485,6 +4339,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) @@ -3494,10 +4350,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr == req->user_data) { - io_poll_remove_one(req); + if (sqe_addr != req->user_data) + continue; + if (io_poll_remove_one(req)) return 0; - } + return -EALREADY; } return -ENOENT; @@ -3543,186 +4400,54 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - if (error) - io_cqring_fill_event(req, error); - else - io_cqring_fill_event(req, mangle_poll(mask)); + io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); } -static void io_poll_complete_work(struct io_wq_work **workptr) +static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) { - struct io_wq_work *work = *workptr; - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_poll_iocb *poll = &req->poll; - struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt = NULL; - __poll_t mask = 0; - int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) { - WRITE_ONCE(poll->canceled, true); - ret = -ECANCELED; - } else if (READ_ONCE(poll->canceled)) { - ret = -ECANCELED; - } - - if (ret != -ECANCELED) - mask = vfs_poll(poll->file, &pt) & poll->events; - - /* - * Note that ->ki_cancel callers also delete iocb from active_reqs after - * calling ->ki_cancel. We need the ctx_lock roundtrip here to - * synchronize with them. In the cancellation case the list_del_init - * itself is not actually needed, but harmless so we keep it in to - * avoid further branches in the fast path. - */ spin_lock_irq(&ctx->completion_lock); - if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, &poll->wait); - spin_unlock_irq(&ctx->completion_lock); - return; - } hash_del(&req->hash_node); - io_poll_complete(req, mask, ret); + io_poll_complete(req, req->result, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req_find_next(req, nxt); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail_links(req); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); } -static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +static void io_poll_task_func(struct callback_head *cb) { - struct io_kiocb *req, *tmp; - struct req_batch rb; + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_kiocb *nxt = NULL; - rb.to_free = rb.need_iter = 0; - spin_lock_irq(&ctx->completion_lock); - llist_for_each_entry_safe(req, tmp, nodes, llist_node) { - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); + io_poll_task_handler(req, &nxt); + if (nxt) { + struct io_ring_ctx *ctx = nxt->ctx; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) { - req->flags |= REQ_F_COMP_LOCKED; - io_free_req(req); - } + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(nxt, NULL); + mutex_unlock(&ctx->uring_lock); } - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); -} - -static void io_poll_flush(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct llist_node *nodes; - - nodes = llist_del_all(&req->ctx->poll_llist); - if (nodes) - __io_poll_flush(req->ctx, nodes); -} - -static void io_poll_trigger_evfd(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - eventfd_signal(req->ctx->cq_ev_fd, 1); - io_put_req(req); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = wait->private; - struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); - struct io_ring_ctx *ctx = req->ctx; - __poll_t mask = key_to_poll(key); + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->poll; - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - - list_del_init(&poll->wait.entry); - - /* - * Run completion inline if we can. We're using trylock here because - * we are violating the completion_lock -> poll wq lock ordering. - * If we have a link timeout we're going to need the completion_lock - * for finalizing the request, mark us as having grabbed that already. - */ - if (mask) { - unsigned long flags; - - if (llist_empty(&ctx->poll_llist) && - spin_trylock_irqsave(&ctx->completion_lock, flags)) { - bool trigger_ev; - - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - - trigger_ev = io_should_trigger_evfd(ctx); - if (trigger_ev && eventfd_signal_count()) { - trigger_ev = false; - req->work.func = io_poll_trigger_evfd; - } else { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - req = NULL; - } - spin_unlock_irqrestore(&ctx->completion_lock, flags); - __io_cqring_ev_posted(ctx, trigger_ev); - } else { - req->result = mask; - req->llist_node.next = NULL; - /* if the list wasn't empty, we're done */ - if (!llist_add(&req->llist_node, &ctx->poll_llist)) - req = NULL; - else - req->work.func = io_poll_flush; - } - } - if (req) - io_queue_async_work(req); - - return 1; + return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int error; -}; - static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - if (unlikely(pt->req->poll.head)) { - pt->error = -EINVAL; - return; - } - - pt->error = 0; - pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + __io_queue_proc(&pt->req->poll, pt, head); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3739,55 +4464,29 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + + /* + * Don't need a reference here, as we're adding it to the task + * task_works list. If the task exits, the list is pruned. + */ + req->task = current; return 0; } -static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +static int io_poll_add(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - bool cancel = false; __poll_t mask; - INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); - - poll->head = NULL; - poll->done = false; - poll->canceled = false; - - ipt.pt._qproc = io_poll_queue_proc; - ipt.pt._key = poll->events; - ipt.req = req; - ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ - - /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); - poll->wait.private = poll; - INIT_LIST_HEAD(&req->list); + ipt.pt._qproc = io_poll_queue_proc; - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, + io_poll_wake); - spin_lock_irq(&ctx->completion_lock); - if (likely(poll->head)) { - spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt.error) - cancel = true; - ipt.error = 0; - mask = 0; - } - if (mask || ipt.error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock(&poll->head->lock); - } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; io_poll_complete(req, mask, 0); @@ -3796,7 +4495,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) if (mask) { io_cqring_ev_posted(ctx); - io_put_req_find_next(req, nxt); + io_put_req(req); } return ipt.error; } @@ -4045,7 +4744,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, - struct io_kiocb **nxt, int success_ret) + int success_ret) { unsigned long flags; int ret; @@ -4071,7 +4770,7 @@ done: if (ret < 0) req_set_fail_links(req); - io_put_req_find_next(req, nxt); + io_put_req(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -4087,11 +4786,11 @@ static int io_async_cancel_prep(struct io_kiocb *req, return 0; } -static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) +static int io_async_cancel(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); + io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); return 0; } @@ -4226,6 +4925,15 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_EPOLL_CTL: ret = io_epoll_ctl_prep(req, sqe); break; + case IORING_OP_SPLICE: + ret = io_splice_prep(req, sqe); + break; + case IORING_OP_PROVIDE_BUFFERS: + ret = io_provide_buffers_prep(req, sqe); + break; + case IORING_OP_REMOVE_BUFFERS: + ret = io_remove_buffers_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4272,29 +4980,43 @@ static void io_cleanup_req(struct io_kiocb *req) case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree((void *)(unsigned long)req->rw.addr); + /* fallthrough */ case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: if (io->rw.iov != io->rw.fast_iov) kfree(io->rw.iov); break; - case IORING_OP_SENDMSG: case IORING_OP_RECVMSG: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree(req->sr_msg.kbuf); + /* fallthrough */ + case IORING_OP_SENDMSG: if (io->msg.iov != io->msg.fast_iov) kfree(io->msg.iov); break; + case IORING_OP_RECV: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree(req->sr_msg.kbuf); + break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: case IORING_OP_STATX: putname(req->open.filename); break; + case IORING_OP_SPLICE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) + bool force_nonblock) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -4311,7 +5033,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, nxt, force_nonblock); + ret = io_read(req, force_nonblock); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -4321,7 +5043,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, nxt, force_nonblock); + ret = io_write(req, force_nonblock); break; case IORING_OP_FSYNC: if (sqe) { @@ -4329,7 +5051,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_fsync(req, nxt, force_nonblock); + ret = io_fsync(req, force_nonblock); break; case IORING_OP_POLL_ADD: if (sqe) { @@ -4337,7 +5059,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_poll_add(req, nxt); + ret = io_poll_add(req); break; case IORING_OP_POLL_REMOVE: if (sqe) { @@ -4353,7 +5075,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_sync_file_range(req, nxt, force_nonblock); + ret = io_sync_file_range(req, force_nonblock); break; case IORING_OP_SENDMSG: case IORING_OP_SEND: @@ -4363,9 +5085,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, nxt, force_nonblock); + ret = io_sendmsg(req, force_nonblock); else - ret = io_send(req, nxt, force_nonblock); + ret = io_send(req, force_nonblock); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -4375,9 +5097,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, nxt, force_nonblock); + ret = io_recvmsg(req, force_nonblock); else - ret = io_recv(req, nxt, force_nonblock); + ret = io_recv(req, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -4401,7 +5123,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, nxt, force_nonblock); + ret = io_accept(req, force_nonblock); break; case IORING_OP_CONNECT: if (sqe) { @@ -4409,7 +5131,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, nxt, force_nonblock); + ret = io_connect(req, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -4417,7 +5139,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_async_cancel(req, nxt); + ret = io_async_cancel(req); break; case IORING_OP_FALLOCATE: if (sqe) { @@ -4425,7 +5147,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_fallocate(req, nxt, force_nonblock); + ret = io_fallocate(req, force_nonblock); break; case IORING_OP_OPENAT: if (sqe) { @@ -4433,7 +5155,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_openat(req, nxt, force_nonblock); + ret = io_openat(req, force_nonblock); break; case IORING_OP_CLOSE: if (sqe) { @@ -4441,7 +5163,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, nxt, force_nonblock); + ret = io_close(req, force_nonblock); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -4457,7 +5179,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_statx(req, nxt, force_nonblock); + ret = io_statx(req, force_nonblock); break; case IORING_OP_FADVISE: if (sqe) { @@ -4465,7 +5187,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_fadvise(req, nxt, force_nonblock); + ret = io_fadvise(req, force_nonblock); break; case IORING_OP_MADVISE: if (sqe) { @@ -4473,7 +5195,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_madvise(req, nxt, force_nonblock); + ret = io_madvise(req, force_nonblock); break; case IORING_OP_OPENAT2: if (sqe) { @@ -4481,7 +5203,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_openat2(req, nxt, force_nonblock); + ret = io_openat2(req, force_nonblock); break; case IORING_OP_EPOLL_CTL: if (sqe) { @@ -4489,7 +5211,31 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, nxt, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock); + break; + case IORING_OP_SPLICE: + if (sqe) { + ret = io_splice_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_splice(req, force_nonblock); + break; + case IORING_OP_PROVIDE_BUFFERS: + if (sqe) { + ret = io_provide_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_provide_buffers(req, force_nonblock); + break; + case IORING_OP_REMOVE_BUFFERS: + if (sqe) { + ret = io_remove_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_remove_buffers(req, force_nonblock); break; default: ret = -EINVAL; @@ -4522,7 +5268,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret = 0; /* if NO_CANCEL is set, we must still run the work */ @@ -4532,9 +5277,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } if (!ret) { - req->in_async = true; do { - ret = io_issue_sqe(req, NULL, &nxt, false); + ret = io_issue_sqe(req, NULL, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -4546,18 +5290,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } while (1); } - /* drop submission reference */ - io_put_req(req); - if (ret) { req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); } - /* if a dependent link is ready, pass it back */ - if (!ret && nxt) - io_wq_assign_next(workptr, nxt); + io_steal_work(req, workptr); } static int io_req_needs_file(struct io_kiocb *req, int fd) @@ -4578,12 +5317,38 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK];; } +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed) +{ + struct io_ring_ctx *ctx = req->ctx; + struct file *file; + + if (fixed) { + if (unlikely(!ctx->file_data || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + fd = array_index_nospec(fd, ctx->nr_user_files); + file = io_file_from_index(ctx, fd); + if (!file) + return -EBADF; + percpu_ref_get(&ctx->file_data->refs); + } else { + trace_io_uring_file_get(ctx, fd); + file = __io_file_get(state, fd); + if (unlikely(!file)) + return -EBADF; + } + + *out_file = file; + return 0; +} + static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; + bool fixed; flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); @@ -4591,26 +5356,11 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!io_req_needs_file(req, fd)) return 0; - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_data || - (unsigned) fd >= ctx->nr_user_files)) - return -EBADF; - fd = array_index_nospec(fd, ctx->nr_user_files); - req->file = io_file_from_index(ctx, fd); - if (!req->file) - return -EBADF; - req->flags |= REQ_F_FIXED_FILE; - percpu_ref_get(&ctx->file_data->refs); - } else { - if (req->needs_fixed_file) - return -EBADF; - trace_io_uring_file_get(ctx, fd); - req->file = io_file_get(state, fd); - if (unlikely(!req->file)) - return -EBADF; - } + fixed = (flags & IOSQE_FIXED_FILE); + if (unlikely(!fixed && req->needs_fixed_file)) + return -EBADF; - return 0; + return io_file_get(state, req, fd, &req->file, fixed); } static int io_grab_files(struct io_kiocb *req) @@ -4672,8 +5422,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (prev) { req_set_fail_links(prev); - io_async_find_and_cancel(ctx, req, prev->user_data, NULL, - -ETIME); + io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { io_cqring_add_event(req, -ETIME); @@ -4710,6 +5459,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK)) return NULL; + /* for polled retry, if flag is set, we already went through here */ + if (req->flags & REQ_F_POLLED) + return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, link_list); @@ -4723,7 +5475,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; - struct io_kiocb *nxt = NULL; + struct io_kiocb *nxt; const struct cred *old_creds = NULL; int ret; @@ -4739,7 +5491,7 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, &nxt, true); + ret = io_issue_sqe(req, sqe, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -4747,6 +5499,11 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { + if (io_arm_poll_handler(req)) { + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + goto exit; + } punt: if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); @@ -4759,10 +5516,11 @@ punt: * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); - goto done_req; + goto exit; } err: + nxt = NULL; /* drop submission reference */ io_put_req_find_next(req, &nxt); @@ -4779,15 +5537,14 @@ err: req_set_fail_links(req); io_put_req(req); } -done_req: if (nxt) { req = nxt; - nxt = NULL; if (req->flags & REQ_F_FORCE_ASYNC) goto punt; goto again; } +exit: if (old_creds) revert_creds(old_creds); } @@ -4829,7 +5586,8 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) @@ -4846,6 +5604,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, goto err_req; } + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) { + ret = -EOPNOTSUPP; + goto err_req; + } + id = READ_ONCE(sqe->personality); if (id) { req->work.creds = idr_find(&ctx->personality_idr, id); @@ -4857,8 +5621,9 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| - IOSQE_ASYNC); + req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK | + IOSQE_ASYNC | IOSQE_FIXED_FILE | + IOSQE_BUFFER_SELECT); ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { @@ -5079,7 +5844,6 @@ fail_req: *mm = ctx->sqo_mm; } - req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, async); @@ -5163,6 +5927,8 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { + if (current->task_works) + task_work_run(); cond_resched(); continue; } @@ -5194,6 +5960,10 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } + if (current->task_works) { + task_work_run(); + continue; + } if (signal_pending(current)) flush_signals(current); schedule(); @@ -5213,6 +5983,9 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } + if (current->task_works) + task_work_run(); + set_fs(old_fs); if (cur_mm) { unuse_mm(cur_mm); @@ -5277,8 +6050,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; int ret = 0; - if (io_cqring_events(ctx, false) >= min_events) - return 0; + do { + if (io_cqring_events(ctx, false) >= min_events) + return 0; + if (!current->task_works) + break; + task_work_run(); + } while (1); if (sig) { #ifdef CONFIG_COMPAT @@ -5298,6 +6076,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); + if (current->task_works) + task_work_run(); if (io_should_wake(&iowq, false)) break; schedule(); @@ -5607,7 +6387,6 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) struct io_file_put { struct llist_node llist; struct file *file; - struct completion *done; }; static void io_ring_file_ref_flush(struct fixed_file_data *data) @@ -5618,10 +6397,7 @@ static void io_ring_file_ref_flush(struct fixed_file_data *data) while ((node = llist_del_all(&data->put_llist)) != NULL) { llist_for_each_entry_safe(pfile, tmp, node, llist) { io_ring_file_put(data->ctx, pfile->file); - if (pfile->done) - complete(pfile->done); - else - kfree(pfile); + kfree(pfile); } } } @@ -5816,33 +6592,18 @@ static void io_atomic_switch(struct percpu_ref *ref) percpu_ref_get(&data->refs); } -static bool io_queue_file_removal(struct fixed_file_data *data, +static int io_queue_file_removal(struct fixed_file_data *data, struct file *file) { - struct io_file_put *pfile, pfile_stack; - DECLARE_COMPLETION_ONSTACK(done); + struct io_file_put *pfile; - /* - * If we fail allocating the struct we need for doing async reomval - * of this file, just punt to sync and wait for it. - */ pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); - if (!pfile) { - pfile = &pfile_stack; - pfile->done = &done; - } + if (!pfile) + return -ENOMEM; pfile->file = file; llist_add(&pfile->llist, &data->put_llist); - - if (pfile == &pfile_stack) { - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); - wait_for_completion(&done); - flush_work(&data->ref_work); - return false; - } - - return true; + return 0; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -5877,9 +6638,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { file = io_file_from_index(ctx, index); + err = io_queue_file_removal(data, file); + if (err) + break; table->files[index] = NULL; - if (io_queue_file_removal(data, file)) - ref_switch = true; + ref_switch = true; } if (fd != -1) { file = fget(fd); @@ -5932,20 +6695,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, return __io_sqe_files_update(ctx, &up, nr_args); } -static void io_put_work(struct io_wq_work *work) +static void io_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + /* Consider that io_steal_work() relies on this ref */ io_put_req(req); } -static void io_get_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - refcount_inc(&req->refs); -} - static int io_init_wq_offload(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -5956,8 +6713,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx, int ret = 0; data.user = ctx->user; - data.get_work = io_get_work; - data.put_work = io_put_work; + data.free_work = io_free_work; if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { /* Do QD, or 4 * CPUS, whatever is smallest */ @@ -6359,6 +7115,21 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } +static int __io_destroy_buffers(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + struct io_buffer *buf = p; + + __io_remove_buffers(ctx, buf, id, -1U); + return 0; +} + +static void io_destroy_buffers(struct io_ring_ctx *ctx) +{ + idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); + idr_destroy(&ctx->io_buffer_idr); +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); @@ -6369,6 +7140,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); + io_destroy_buffers(ctx); idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) @@ -6623,6 +7395,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; + if (current->task_works) + task_work_run(); + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; @@ -6669,7 +7444,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, min_complete = min(min_complete, ctx->cq_entries); - if (ctx->flags & IORING_SETUP_IOPOLL) { + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. + */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_iopoll_check(ctx, &nr_events, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); @@ -6745,6 +7527,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } + seq_printf(m, "PollList:\n"); + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + req->task->task_works != NULL); + } + spin_unlock_irq(&ctx->completion_lock); mutex_unlock(&ctx->uring_lock); } @@ -6961,7 +7754,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -7239,6 +8032,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); @@ -7253,11 +8047,14 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; diff --git a/fs/splice.c b/fs/splice.c index d671936d0aad..4735defc46ee 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; diff --git a/include/linux/socket.h b/include/linux/socket.h index 15f3412d481e..54338fac45cb 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); +extern int __copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec __user **uiov, size_t *nsegs); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/include/linux/splice.h b/include/linux/splice.h index 74b4911ac16d..ebbbfea48aa0 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, struct pipe_buffer *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +extern long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags); /* * for dynamic pipe sizing diff --git a/include/net/compat.h b/include/net/compat.h index f277653c7e17..e341260642fe 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -38,6 +38,9 @@ struct compat_cmsghdr { #define compat_mmsghdr mmsghdr #endif /* defined(CONFIG_COMPAT) */ +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, compat_uptr_t *ptr, + compat_size_t *len); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval); diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 27bd9e4f927b..9f0d3b7d56b0 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->force_nonblock, __entry->sq_thread) ); +TRACE_EVENT(io_uring_poll_arm, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), + + TP_ARGS(ctx, opcode, user_data, mask, events), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + __field( int, events ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + __entry->events = events; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask, __entry->events) +); + +TRACE_EVENT(io_uring_poll_wake, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_add, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask %x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_run, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data), + + TP_ARGS(ctx, opcode, user_data), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + ), + + TP_printk("ring %p, op %d, data 0x%llx", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 3f7961c1c243..e48d746b8e2a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Header file for the io_uring interface. * @@ -23,7 +23,10 @@ struct io_uring_sqe { __u64 off; /* offset into file */ __u64 addr2; }; - __u64 addr; /* pointer to buffer or iovecs */ + union { + __u64 addr; /* pointer to buffer or iovecs */ + __u64 splice_off_in; + }; __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; @@ -37,14 +40,21 @@ struct io_uring_sqe { __u32 open_flags; __u32 statx_flags; __u32 fadvise_advice; + __u32 splice_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { struct { - /* index into fixed buffers, if used */ - __u16 buf_index; + /* pack this to avoid bogus arm OABI complaints */ + union { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* for grouped buffer selection */ + __u16 buf_group; + } __attribute__((packed)); /* personality to use, if used */ __u16 personality; + __s32 splice_fd_in; }; __u64 __pad2[3]; }; @@ -56,6 +66,7 @@ enum { IOSQE_IO_LINK_BIT, IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, + IOSQE_BUFFER_SELECT_BIT, }; /* @@ -71,6 +82,8 @@ enum { #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) /* always go async */ #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) +/* select buffer from sqe->buf_group */ +#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) /* * io_uring_setup() flags @@ -113,6 +126,9 @@ enum { IORING_OP_RECV, IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, + IORING_OP_SPLICE, + IORING_OP_PROVIDE_BUFFERS, + IORING_OP_REMOVE_BUFFERS, /* this goes last, obviously */ IORING_OP_LAST, @@ -128,6 +144,12 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) +/* + * sqe->splice_flags + * extends splice(2) flags + */ +#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ + /* * IO completion data structure (Completion Queue Entry) */ @@ -137,6 +159,17 @@ struct io_uring_cqe { __u32 flags; }; +/* + * cqe->flags + * + * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID + */ +#define IORING_CQE_F_BUFFER (1U << 0) + +enum { + IORING_CQE_BUFFER_SHIFT = 16, +}; + /* * Magic offsets for the application to mmap the data it needs */ @@ -204,6 +237,7 @@ struct io_uring_params { #define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) +#define IORING_FEAT_FAST_POLL (1U << 5) /* * io_uring_register(2) opcodes and arguments diff --git a/kernel/task_work.c b/kernel/task_work.c index 0fef395662a6..825f28259a19 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -97,16 +97,26 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ - raw_spin_lock_irq(&task->pi_lock); do { + head = NULL; work = READ_ONCE(task->task_works); - head = !work && (task->flags & PF_EXITING) ? - &work_exited : NULL; + if (!work) { + if (task->flags & PF_EXITING) + head = &work_exited; + else + break; + } } while (cmpxchg(&task->task_works, work, head) != work); - raw_spin_unlock_irq(&task->pi_lock); if (!work) break; + /* + * Synchronize with task_work_cancel(). It can not remove + * the first entry == work, cmpxchg(task_works) must fail. + * But it can remove another entry from the ->next list. + */ + raw_spin_lock_irq(&task->pi_lock); + raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; diff --git a/net/compat.c b/net/compat.c index 47d99c784947..4bed96e84d9a 100644 --- a/net/compat.c +++ b/net/compat.c @@ -33,10 +33,10 @@ #include #include -int get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +int __get_compat_msghdr(struct msghdr *kmsg, + struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, + compat_uptr_t *ptr, compat_size_t *len) { struct compat_msghdr msg; ssize_t err; @@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg, return -EMSGSIZE; kmsg->msg_iocb = NULL; + *ptr = msg.msg_iov; + *len = msg.msg_iovlen; + return 0; +} - err = compat_import_iovec(save_addr ? READ : WRITE, - compat_ptr(msg.msg_iov), msg.msg_iovlen, - UIO_FASTIOV, iov, &kmsg->msg_iter); +int get_compat_msghdr(struct msghdr *kmsg, + struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) +{ + compat_uptr_t ptr; + compat_size_t len; + ssize_t err; + + err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len); + if (err) + return err; + + err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), + len, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } diff --git a/net/socket.c b/net/socket.c index 2eecf1517f76..2dd739fba866 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2228,10 +2228,10 @@ struct used_address { unsigned int name_len; }; -static int copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +int __copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec __user **uiov, size_t *nsegs) { struct user_msghdr msg; ssize_t err; @@ -2273,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, return -EMSGSIZE; kmsg->msg_iocb = NULL; + *uiov = msg.msg_iov; + *nsegs = msg.msg_iovlen; + return 0; +} + +static int copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) +{ + struct user_msghdr msg; + ssize_t err; + + err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov, + &msg.msg_iovlen); + if (err) + return err; err = import_iovec(save_addr ? READ : WRITE, msg.msg_iov, msg.msg_iovlen,