for-5.15/io_uring-2021-08-30

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmEs7tIQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpoR2EACdOPj0tivXWufgFnQyQYPpX4/Qe3lNw608
 MLJ9/zshPFe5kx+SnHzT3UucHd2LO4C68pNVEHdHoi1gqMnMe9+Du82LJlo+Cx9i
 53yiWxNiY7er3t3lC2nvaPF0BPKiaUKaRzqugOTfdWmKLP3MyYyygEBrRDkaK1S1
 BjVq2ewmKTYf63gHkluHeRTav9KxcLWvSqgFUC8Y0mNhazTSdzGB6MAPFodpsuj1
 Vv8ytCiagp9Gi0AibvS2mZvV/WxQtFP7qBofbnG3KcgKgOU+XKTJCH+cU6E/3J9Q
 nPy1loh2TISOtYAz2scypAbwsK4FWeAHg1SaIj/RtUGKG7zpVU5u97CPZ8+UfHzu
 CuR3a1o36Cck8+ZdZIjtRZfvQGog0Dh5u4ZQ4dRwFQd6FiVxdO8LHkegqkV6PStc
 dVrHSo5kUE5hGT8ed1YFfuOSJDZ6w0/LleZGMdU4pRGGs8wqkerZlfLL4ustSfLk
 AcS+azmG2f3iI5iadnxOUNeOT2lmE84fvvLyH2krfsA3AtX0CtHXQcLYAguRAIwg
 gnvYf70JOya6Lb/hbXUi8d8h3uXxeFsoitz1QyasqAEoJoY05EW+l94NbiEzXwol
 mKrVfZCk+wuhw3npbVCqK7nkepvBWso1qTip//T0HDAaKaZnZcmhobUn5SwI+QPx
 fcgAo6iw8g==
 =WBzF
 -----END PGP SIGNATURE-----

Merge tag 'for-5.15/io_uring-2021-08-30' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - cancellation cleanups (Hao, Pavel)

 - io-wq accounting cleanup (Hao)

 - io_uring submit locking fix (Hao)

 - io_uring link handling fixes (Hao)

 - fixed file improvements (wangyangbo, Pavel)

 - allow updates of linked timeouts like regular timeouts (Pavel)

 - IOPOLL fix (Pavel)

 - remove batched file get optimization (Pavel)

 - improve reference handling (Pavel)

 - IRQ task_work batching (Pavel)

 - allow pure fixed file, and add support for open/accept (Pavel)

 - GFP_ATOMIC RT kernel fix

 - multiple CQ ring waiter improvement

 - funnel IRQ completions through task_work

 - add support for limiting async workers explicitly

 - add different clocksource support for timeouts

 - io-wq wakeup race fix

 - lots of cleanups and improvements (Pavel et al)

* tag 'for-5.15/io_uring-2021-08-30' of git://git.kernel.dk/linux-block: (87 commits)
  io-wq: fix wakeup race when adding new work
  io-wq: wqe and worker locks no longer need to be IRQ safe
  io-wq: check max_worker limits if a worker transitions bound state
  io_uring: allow updating linked timeouts
  io_uring: keep ltimeouts in a list
  io_uring: support CLOCK_BOOTTIME/REALTIME for timeouts
  io-wq: provide a way to limit max number of workers
  io_uring: add build check for buf_index overflows
  io_uring: clarify io_req_task_cancel() locking
  io_uring: add task-refs-get helper
  io_uring: fix failed linkchain code logic
  io_uring: remove redundant req_set_fail()
  io_uring: don't free request to slab
  io_uring: accept directly into fixed file table
  io_uring: hand code io_accept() fd installing
  io_uring: openat directly into fixed fd table
  net: add accept helper not installing fd
  io_uring: fix io_try_cancel_userdata race for iowq
  io_uring: IRQ rw completion batching
  io_uring: batch task work locking
  ...
This commit is contained in:
Linus Torvalds 2021-08-30 19:22:52 -07:00
commit c547d89a9a
8 changed files with 1227 additions and 856 deletions

View File

@ -51,6 +51,10 @@ struct io_worker {
struct completion ref_done; struct completion ref_done;
unsigned long create_state;
struct callback_head create_work;
int create_index;
struct rcu_head rcu; struct rcu_head rcu;
}; };
@ -174,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker)
complete(&worker->ref_done); complete(&worker->ref_done);
wait_for_completion(&worker->ref_done); wait_for_completion(&worker->ref_done);
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
if (worker->flags & IO_WORKER_F_FREE) if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node); hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list); list_del_rcu(&worker->all_list);
@ -184,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker)
worker->flags = 0; worker->flags = 0;
current->flags &= ~PF_IO_WORKER; current->flags &= ~PF_IO_WORKER;
preempt_enable(); preempt_enable();
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu); kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq); io_worker_ref_put(wqe->wq);
@ -250,19 +254,20 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (!ret) { if (!ret) {
bool do_create = false, first = false; bool do_create = false, first = false;
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) { if (acct->nr_workers < acct->max_workers) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
if (!acct->nr_workers) if (!acct->nr_workers)
first = true; first = true;
acct->nr_workers++; acct->nr_workers++;
do_create = true; do_create = true;
} }
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
if (do_create) if (do_create) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
create_io_worker(wqe->wq, wqe, acct->index, first); create_io_worker(wqe->wq, wqe, acct->index, first);
} }
}
} }
static void io_wqe_inc_running(struct io_worker *worker) static void io_wqe_inc_running(struct io_worker *worker)
@ -272,60 +277,63 @@ static void io_wqe_inc_running(struct io_worker *worker)
atomic_inc(&acct->nr_running); atomic_inc(&acct->nr_running);
} }
struct create_worker_data {
struct callback_head work;
struct io_wqe *wqe;
int index;
};
static void create_worker_cb(struct callback_head *cb) static void create_worker_cb(struct callback_head *cb)
{ {
struct create_worker_data *cwd; struct io_worker *worker;
struct io_wq *wq; struct io_wq *wq;
struct io_wqe *wqe; struct io_wqe *wqe;
struct io_wqe_acct *acct; struct io_wqe_acct *acct;
bool do_create = false, first = false; bool do_create = false, first = false;
cwd = container_of(cb, struct create_worker_data, work); worker = container_of(cb, struct io_worker, create_work);
wqe = cwd->wqe; wqe = worker->wqe;
wq = wqe->wq; wq = wqe->wq;
acct = &wqe->acct[cwd->index]; acct = &wqe->acct[worker->create_index];
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) { if (acct->nr_workers < acct->max_workers) {
if (!acct->nr_workers) if (!acct->nr_workers)
first = true; first = true;
acct->nr_workers++; acct->nr_workers++;
do_create = true; do_create = true;
} }
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
if (do_create) { if (do_create) {
create_io_worker(wq, wqe, cwd->index, first); create_io_worker(wq, wqe, worker->create_index, first);
} else { } else {
atomic_dec(&acct->nr_running); atomic_dec(&acct->nr_running);
io_worker_ref_put(wq); io_worker_ref_put(wq);
} }
kfree(cwd); clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
} }
static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct) static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
struct io_wqe_acct *acct)
{ {
struct create_worker_data *cwd;
struct io_wq *wq = wqe->wq; struct io_wq *wq = wqe->wq;
/* raced with exit, just ignore create call */ /* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail; goto fail;
if (!io_worker_get(worker))
goto fail;
/*
* create_state manages ownership of create_work/index. We should
* only need one entry per worker, as the worker going to sleep
* will trigger the condition, and waking will clear it once it
* runs the task_work.
*/
if (test_bit(0, &worker->create_state) ||
test_and_set_bit_lock(0, &worker->create_state))
goto fail_release;
cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC); init_task_work(&worker->create_work, create_worker_cb);
if (cwd) { worker->create_index = acct->index;
init_task_work(&cwd->work, create_worker_cb); if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
cwd->wqe = wqe;
cwd->index = acct->index;
if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
return; return;
clear_bit_unlock(0, &worker->create_state);
kfree(cwd); fail_release:
} io_worker_release(worker);
fail: fail:
atomic_dec(&acct->nr_running); atomic_dec(&acct->nr_running);
io_worker_ref_put(wq); io_worker_ref_put(wq);
@ -343,7 +351,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
atomic_inc(&acct->nr_running); atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs); atomic_inc(&wqe->wq->worker_refs);
io_queue_worker_create(wqe, acct); io_queue_worker_create(wqe, worker, acct);
} }
} }
@ -416,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
spin_unlock(&wq->hash->wait.lock); spin_unlock(&wq->hash->wait.lock);
} }
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) /*
* We can always run the work if the worker is currently the same type as
* the work (eg both are bound, or both are unbound). If they are not the
* same, only allow it if incrementing the worker count would be allowed.
*/
static bool io_worker_can_run_work(struct io_worker *worker,
struct io_wq_work *work)
{
struct io_wqe_acct *acct;
if (!(worker->flags & IO_WORKER_F_BOUND) !=
!(work->flags & IO_WQ_WORK_UNBOUND))
return true;
/* not the same type, check if we'd go over the limit */
acct = io_work_get_acct(worker->wqe, work);
return acct->nr_workers < acct->max_workers;
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
struct io_worker *worker,
bool *stalled)
__must_hold(wqe->lock) __must_hold(wqe->lock)
{ {
struct io_wq_work_node *node, *prev; struct io_wq_work_node *node, *prev;
@ -428,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
work = container_of(node, struct io_wq_work, list); work = container_of(node, struct io_wq_work, list);
if (!io_worker_can_run_work(worker, work))
break;
/* not hashed, can run anytime */ /* not hashed, can run anytime */
if (!io_wq_is_hashed(work)) { if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev); wq_list_del(&wqe->work_list, node, prev);
@ -454,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
raw_spin_unlock(&wqe->lock); raw_spin_unlock(&wqe->lock);
io_wait_on_hash(wqe, stall_hash); io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock); raw_spin_lock(&wqe->lock);
*stalled = true;
} }
return NULL; return NULL;
@ -477,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched(); cond_resched();
} }
spin_lock_irq(&worker->lock); spin_lock(&worker->lock);
worker->cur_work = work; worker->cur_work = work;
spin_unlock_irq(&worker->lock); spin_unlock(&worker->lock);
} }
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
@ -493,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker)
do { do {
struct io_wq_work *work; struct io_wq_work *work;
bool stalled;
get_next: get_next:
/* /*
* If we got some work, mark us as busy. If we didn't, but * If we got some work, mark us as busy. If we didn't, but
@ -501,13 +535,14 @@ get_next:
* can't make progress, any work completion or insertion will * can't make progress, any work completion or insertion will
* clear the stalled flag. * clear the stalled flag.
*/ */
work = io_get_next_work(wqe); stalled = false;
work = io_get_next_work(wqe, worker, &stalled);
if (work) if (work)
__io_worker_busy(wqe, worker, work); __io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list)) else if (stalled)
wqe->flags |= IO_WQE_FLAG_STALLED; wqe->flags |= IO_WQE_FLAG_STALLED;
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
if (!work) if (!work)
break; break;
io_assign_current_work(worker, work); io_assign_current_work(worker, work);
@ -539,16 +574,16 @@ get_next:
clear_bit(hash, &wq->hash->map); clear_bit(hash, &wq->hash->map);
if (wq_has_sleeper(&wq->hash->wait)) if (wq_has_sleeper(&wq->hash->wait))
wake_up(&wq->hash->wait); wake_up(&wq->hash->wait);
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
wqe->flags &= ~IO_WQE_FLAG_STALLED; wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */ /* skip unnecessary unlock-lock wqe->lock */
if (!work) if (!work)
goto get_next; goto get_next;
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
} }
} while (work); } while (work);
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
} while (1); } while (1);
} }
@ -569,13 +604,13 @@ static int io_wqe_worker(void *data)
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
loop: loop:
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
if (io_wqe_run_queue(wqe)) { if (io_wqe_run_queue(wqe)) {
io_worker_handle_work(worker); io_worker_handle_work(worker);
goto loop; goto loop;
} }
__io_worker_idle(wqe, worker); __io_worker_idle(wqe, worker);
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
if (io_flush_signals()) if (io_flush_signals())
continue; continue;
ret = schedule_timeout(WORKER_IDLE_TIMEOUT); ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
@ -594,7 +629,7 @@ loop:
} }
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
io_worker_handle_work(worker); io_worker_handle_work(worker);
} }
@ -636,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING; worker->flags &= ~IO_WORKER_F_RUNNING;
raw_spin_lock_irq(&worker->wqe->lock); raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker); io_wqe_dec_running(worker);
raw_spin_unlock_irq(&worker->wqe->lock); raw_spin_unlock(&worker->wqe->lock);
} }
static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
@ -664,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo
kfree(worker); kfree(worker);
fail: fail:
atomic_dec(&acct->nr_running); atomic_dec(&acct->nr_running);
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
acct->nr_workers--; acct->nr_workers--;
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wq); io_worker_ref_put(wq);
return; return;
} }
@ -676,7 +711,7 @@ fail:
set_cpus_allowed_ptr(tsk, wqe->cpu_mask); set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
tsk->flags |= PF_NO_SETAFFINITY; tsk->flags |= PF_NO_SETAFFINITY;
raw_spin_lock_irq(&wqe->lock); raw_spin_lock(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE; worker->flags |= IO_WORKER_F_FREE;
@ -684,7 +719,7 @@ fail:
worker->flags |= IO_WORKER_F_BOUND; worker->flags |= IO_WORKER_F_BOUND;
if (first && (worker->flags & IO_WORKER_F_BOUND)) if (first && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED; worker->flags |= IO_WORKER_F_FIXED;
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock(&wqe->lock);
wake_up_new_task(tsk); wake_up_new_task(tsk);
} }
@ -759,8 +794,7 @@ append:
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{ {
struct io_wqe_acct *acct = io_work_get_acct(wqe, work); struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags; bool do_wake;
unsigned long flags;
/* /*
* If io-wq is exiting for this task, or if the request has explicitly * If io-wq is exiting for this task, or if the request has explicitly
@ -772,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return; return;
} }
work_flags = work->flags; raw_spin_lock(&wqe->lock);
raw_spin_lock_irqsave(&wqe->lock, flags);
io_wqe_insert_work(wqe, work); io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED; wqe->flags &= ~IO_WQE_FLAG_STALLED;
raw_spin_unlock_irqrestore(&wqe->lock, flags); do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running);
raw_spin_unlock(&wqe->lock);
if ((work_flags & IO_WQ_WORK_CONCURRENT) || if (do_wake)
!atomic_read(&acct->nr_running))
io_wqe_wake_worker(wqe, acct); io_wqe_wake_worker(wqe, acct);
} }
@ -805,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
static bool io_wq_worker_cancel(struct io_worker *worker, void *data) static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{ {
struct io_cb_cancel_data *match = data; struct io_cb_cancel_data *match = data;
unsigned long flags;
/* /*
* Hold the lock to avoid ->cur_work going out of scope, caller * Hold the lock to avoid ->cur_work going out of scope, caller
* may dereference the passed in work. * may dereference the passed in work.
*/ */
spin_lock_irqsave(&worker->lock, flags); spin_lock(&worker->lock);
if (worker->cur_work && if (worker->cur_work &&
match->fn(worker->cur_work, match->data)) { match->fn(worker->cur_work, match->data)) {
set_notify_signal(worker->task); set_notify_signal(worker->task);
match->nr_running++; match->nr_running++;
} }
spin_unlock_irqrestore(&worker->lock, flags); spin_unlock(&worker->lock);
return match->nr_running && !match->cancel_all; return match->nr_running && !match->cancel_all;
} }
@ -845,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
{ {
struct io_wq_work_node *node, *prev; struct io_wq_work_node *node, *prev;
struct io_wq_work *work; struct io_wq_work *work;
unsigned long flags;
retry: retry:
raw_spin_lock_irqsave(&wqe->lock, flags); raw_spin_lock(&wqe->lock);
wq_list_for_each(node, prev, &wqe->work_list) { wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list); work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data)) if (!match->fn(work, match->data))
continue; continue;
io_wqe_remove_pending(wqe, work, prev); io_wqe_remove_pending(wqe, work, prev);
raw_spin_unlock_irqrestore(&wqe->lock, flags); raw_spin_unlock(&wqe->lock);
io_run_cancel(work, wqe); io_run_cancel(work, wqe);
match->nr_pending++; match->nr_pending++;
if (!match->cancel_all) if (!match->cancel_all)
@ -863,7 +895,7 @@ retry:
/* not safe to continue after unlock */ /* not safe to continue after unlock */
goto retry; goto retry;
} }
raw_spin_unlock_irqrestore(&wqe->lock, flags); raw_spin_unlock(&wqe->lock);
} }
static void io_wqe_cancel_running_work(struct io_wqe *wqe, static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@ -1004,12 +1036,12 @@ err_wq:
static bool io_task_work_match(struct callback_head *cb, void *data) static bool io_task_work_match(struct callback_head *cb, void *data)
{ {
struct create_worker_data *cwd; struct io_worker *worker;
if (cb->func != create_worker_cb) if (cb->func != create_worker_cb)
return false; return false;
cwd = container_of(cb, struct create_worker_data, work); worker = container_of(cb, struct io_worker, create_work);
return cwd->wqe->wq == data; return worker->wqe->wq == data;
} }
void io_wq_exit_start(struct io_wq *wq) void io_wq_exit_start(struct io_wq *wq)
@ -1026,12 +1058,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
return; return;
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct create_worker_data *cwd; struct io_worker *worker;
cwd = container_of(cb, struct create_worker_data, work); worker = container_of(cb, struct io_worker, create_work);
atomic_dec(&cwd->wqe->acct[cwd->index].nr_running); atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
io_worker_ref_put(wq); io_worker_ref_put(wq);
kfree(cwd); clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
} }
rcu_read_lock(); rcu_read_lock();
@ -1143,6 +1176,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
return 0; return 0;
} }
/*
* Set max number of unbounded workers, returns old value. If new_count is 0,
* then just return the old value.
*/
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
int i, node, prev = 0;
for (i = 0; i < 2; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
rcu_read_lock();
for_each_node(node) {
struct io_wqe_acct *acct;
for (i = 0; i < 2; i++) {
acct = &wq->wqes[node]->acct[i];
prev = max_t(int, acct->max_workers, prev);
if (new_count[i])
acct->max_workers = new_count[i];
new_count[i] = prev;
}
}
rcu_read_unlock();
return 0;
}
static __init int io_wq_init(void) static __init int io_wq_init(void)
{ {
int ret; int ret;

View File

@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
static inline void wq_list_add_tail(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list) struct io_wq_work_list *list)
{ {
node->next = NULL;
if (!list->first) { if (!list->first) {
list->last = node; list->last = node;
WRITE_ONCE(list->first, node); WRITE_ONCE(list->first, node);
@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
list->last->next = node; list->last->next = node;
list->last = node; list->last = node;
} }
node->next = NULL;
} }
static inline void wq_list_cut(struct io_wq_work_list *list, static inline void wq_list_cut(struct io_wq_work_list *list,
@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val); void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work) static inline bool io_wq_is_hashed(struct io_wq_work *work)
{ {

File diff suppressed because it is too large Load Diff

View File

@ -7,17 +7,18 @@
#if defined(CONFIG_IO_URING) #if defined(CONFIG_IO_URING)
struct sock *io_uring_get_socket(struct file *file); struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(struct files_struct *files); void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk); void __io_uring_free(struct task_struct *tsk);
static inline void io_uring_files_cancel(struct files_struct *files) static inline void io_uring_files_cancel(void)
{ {
if (current->io_uring) if (current->io_uring)
__io_uring_cancel(files); __io_uring_cancel(false);
} }
static inline void io_uring_task_cancel(void) static inline void io_uring_task_cancel(void)
{ {
return io_uring_files_cancel(NULL); if (current->io_uring)
__io_uring_cancel(true);
} }
static inline void io_uring_free(struct task_struct *tsk) static inline void io_uring_free(struct task_struct *tsk)
{ {
@ -32,7 +33,7 @@ static inline struct sock *io_uring_get_socket(struct file *file)
static inline void io_uring_task_cancel(void) static inline void io_uring_task_cancel(void)
{ {
} }
static inline void io_uring_files_cancel(struct files_struct *files) static inline void io_uring_files_cancel(void)
{ {
} }
static inline void io_uring_free(struct task_struct *tsk) static inline void io_uring_free(struct task_struct *tsk)

View File

@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags, int __user *upeer_addrlen, int flags,
unsigned long nofile); unsigned long nofile);
extern struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags); int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol); extern int __sys_socket(int family, int type, int protocol);

View File

@ -55,7 +55,10 @@ struct io_uring_sqe {
} __attribute__((packed)); } __attribute__((packed));
/* personality to use, if used */ /* personality to use, if used */
__u16 personality; __u16 personality;
union {
__s32 splice_fd_in; __s32 splice_fd_in;
__u32 file_index;
};
__u64 __pad2[2]; __u64 __pad2[2];
}; };
@ -148,7 +151,11 @@ enum {
*/ */
#define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_UPDATE (1U << 1) #define IORING_TIMEOUT_UPDATE (1U << 1)
#define IORING_TIMEOUT_BOOTTIME (1U << 2)
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/* /*
* sqe->splice_flags * sqe->splice_flags
* extends splice(2) flags * extends splice(2) flags
@ -306,6 +313,9 @@ enum {
IORING_REGISTER_IOWQ_AFF = 17, IORING_REGISTER_IOWQ_AFF = 17,
IORING_UNREGISTER_IOWQ_AFF = 18, IORING_UNREGISTER_IOWQ_AFF = 18,
/* set/get max number of workers */
IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST IORING_REGISTER_LAST
}; };

View File

@ -777,7 +777,7 @@ void __noreturn do_exit(long code)
schedule(); schedule();
} }
io_uring_files_cancel(tsk->files); io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */ exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */ /* sync mm's RSS info before statistics gathering */

View File

@ -1722,32 +1722,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
return __sys_listen(fd, backlog); return __sys_listen(fd, backlog);
} }
int __sys_accept4_file(struct file *file, unsigned file_flags, struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags, int __user *upeer_addrlen, int flags)
unsigned long nofile)
{ {
struct socket *sock, *newsock; struct socket *sock, *newsock;
struct file *newfile; struct file *newfile;
int err, len, newfd; int err, len;
struct sockaddr_storage address; struct sockaddr_storage address;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
sock = sock_from_file(file); sock = sock_from_file(file);
if (!sock) { if (!sock)
err = -ENOTSOCK; return ERR_PTR(-ENOTSOCK);
goto out;
}
err = -ENFILE;
newsock = sock_alloc(); newsock = sock_alloc();
if (!newsock) if (!newsock)
goto out; return ERR_PTR(-ENFILE);
newsock->type = sock->type; newsock->type = sock->type;
newsock->ops = sock->ops; newsock->ops = sock->ops;
@ -1758,18 +1748,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
*/ */
__module_get(newsock->ops->owner); __module_get(newsock->ops->owner);
newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
goto out;
}
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
if (IS_ERR(newfile)) { if (IS_ERR(newfile))
err = PTR_ERR(newfile); return newfile;
put_unused_fd(newfd);
goto out;
}
err = security_socket_accept(sock, newsock); err = security_socket_accept(sock, newsock);
if (err) if (err)
@ -1794,16 +1775,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
} }
/* File flags are not inherited via accept() unlike another OSes. */ /* File flags are not inherited via accept() unlike another OSes. */
return newfile;
fd_install(newfd, newfile);
err = newfd;
out:
return err;
out_fd: out_fd:
fput(newfile); fput(newfile);
put_unused_fd(newfd); return ERR_PTR(err);
goto out; }
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile)
{
struct file *newfile;
int newfd;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0))
return newfd;
newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen,
flags);
if (IS_ERR(newfile)) {
put_unused_fd(newfd);
return PTR_ERR(newfile);
}
fd_install(newfd, newfile);
return newfd;
} }
/* /*