From e5c6828493b5fa6a3c4606b43e80ab6c5ec1111f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Sep 2021 14:11:02 -0300 Subject: [PATCH] futex: Split out requeue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move all the requeue bits into their own file. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: André Almeida Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: André Almeida Link: https://lore.kernel.org/r/20210923171111.300673-14-andrealmeid@collabora.com --- kernel/futex/Makefile | 2 +- kernel/futex/core.c | 966 +---------------------------------------- kernel/futex/futex.h | 77 +++- kernel/futex/requeue.c | 897 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 979 insertions(+), 963 deletions(-) create mode 100644 kernel/futex/requeue.c diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile index 27b71c2e8fa8..c0409419fe8d 100644 --- a/kernel/futex/Makefile +++ b/kernel/futex/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o syscalls.o pi.o +obj-y += core.o syscalls.o pi.o requeue.o diff --git a/kernel/futex/core.c b/kernel/futex/core.c index bcc4aa052f9d..42f2735e9abf 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -147,64 +147,6 @@ int __read_mostly futex_cmpxchg_enabled; #endif -/* - * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an - * underlying rtmutex. The task which is about to be requeued could have - * just woken up (timeout, signal). After the wake up the task has to - * acquire hash bucket lock, which is held by the requeue code. As a task - * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking - * and the hash bucket lock blocking would collide and corrupt state. - * - * On !PREEMPT_RT this is not a problem and everything could be serialized - * on hash bucket lock, but aside of having the benefit of common code, - * this allows to avoid doing the requeue when the task is already on the - * way out and taking the hash bucket lock of the original uaddr1 when the - * requeue has been completed. - * - * The following state transitions are valid: - * - * On the waiter side: - * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE - * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT - * - * On the requeue side: - * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS - * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED - * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) - * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED - * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) - * - * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this - * signals that the waiter is already on the way out. It also means that - * the waiter is still on the 'wait' futex, i.e. uaddr1. - * - * The waiter side signals early wakeup to the requeue side either through - * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending - * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately - * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, - * which means the wakeup is interleaving with a requeue in progress it has - * to wait for the requeue side to change the state. Either to DONE/LOCKED - * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex - * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by - * the requeue side when the requeue attempt failed via deadlock detection - * and therefore the waiter q is still on the uaddr1 futex. - */ -enum { - Q_REQUEUE_PI_NONE = 0, - Q_REQUEUE_PI_IGNORE, - Q_REQUEUE_PI_IN_PROGRESS, - Q_REQUEUE_PI_WAIT, - Q_REQUEUE_PI_DONE, - Q_REQUEUE_PI_LOCKED, -}; - -const struct futex_q futex_q_init = { - /* list gets initialized in futex_queue()*/ - .key = FUTEX_KEY_INIT, - .bitset = FUTEX_BITSET_MATCH_ANY, - .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), -}; - /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they @@ -269,31 +211,6 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ -/* - * Reflects a new waiter being added to the waitqueue. - */ -static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) -{ -#ifdef CONFIG_SMP - atomic_inc(&hb->waiters); - /* - * Full barrier (A), see the ordering comment above. - */ - smp_mb__after_atomic(); -#endif -} - -/* - * Reflects a waiter being removed from the waitqueue by wakeup - * paths. - */ -static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) -{ -#ifdef CONFIG_SMP - atomic_dec(&hb->waiters); -#endif -} - static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP @@ -323,21 +240,6 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) } -/** - * futex_match - Check whether two futex keys are equal - * @key1: Pointer to key1 - * @key2: Pointer to key2 - * - * Return 1 if two futex_keys are equal, 0 otherwise. - */ -static inline int futex_match(union futex_key *key1, union futex_key *key2) -{ - return (key1 && key2 - && key1->both.word == key2->both.word - && key1->both.ptr == key2->both.ptr - && key1->both.offset == key2->both.offset); -} - /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value @@ -713,7 +615,7 @@ void wait_for_owner_exiting(int ret, struct task_struct *exiting) * * The q->lock_ptr must not be NULL and must be held by the caller. */ -static void __futex_unqueue(struct futex_q *q) +void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; @@ -732,7 +634,7 @@ static void __futex_unqueue(struct futex_q *q) * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ -static void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) +void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; @@ -757,30 +659,6 @@ static void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) wake_q_add_safe(wake_q, p); } -/* - * Express the locking dependencies for lockdep: - */ -static inline void -double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - if (hb1 <= hb2) { - spin_lock(&hb1->lock); - if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); - } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); - } -} - -static inline void -double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); -} - /* * Wake up waiters matching bitset queued on this futex (uaddr). */ @@ -961,619 +839,6 @@ out_unlock: return ret; } -/** - * requeue_futex() - Requeue a futex_q from one hb to another - * @q: the futex_q to requeue - * @hb1: the source hash_bucket - * @hb2: the target hash_bucket - * @key2: the new key for the requeued futex_q - */ -static inline -void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, union futex_key *key2) -{ - - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(&hb1->chain != &hb2->chain)) { - plist_del(&q->list, &hb1->chain); - futex_hb_waiters_dec(hb1); - futex_hb_waiters_inc(hb2); - plist_add(&q->list, &hb2->chain); - q->lock_ptr = &hb2->lock; - } - q->key = *key2; -} - -static inline bool futex_requeue_pi_prepare(struct futex_q *q, - struct futex_pi_state *pi_state) -{ - int old, new; - - /* - * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has - * already set Q_REQUEUE_PI_IGNORE to signal that requeue should - * ignore the waiter. - */ - old = atomic_read_acquire(&q->requeue_state); - do { - if (old == Q_REQUEUE_PI_IGNORE) - return false; - - /* - * futex_proxy_trylock_atomic() might have set it to - * IN_PROGRESS and a interleaved early wake to WAIT. - * - * It was considered to have an extra state for that - * trylock, but that would just add more conditionals - * all over the place for a dubious value. - */ - if (old != Q_REQUEUE_PI_NONE) - break; - - new = Q_REQUEUE_PI_IN_PROGRESS; - } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); - - q->pi_state = pi_state; - return true; -} - -static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) -{ - int old, new; - - old = atomic_read_acquire(&q->requeue_state); - do { - if (old == Q_REQUEUE_PI_IGNORE) - return; - - if (locked >= 0) { - /* Requeue succeeded. Set DONE or LOCKED */ - WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && - old != Q_REQUEUE_PI_WAIT); - new = Q_REQUEUE_PI_DONE + locked; - } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { - /* Deadlock, no early wakeup interleave */ - new = Q_REQUEUE_PI_NONE; - } else { - /* Deadlock, early wakeup interleave. */ - WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); - new = Q_REQUEUE_PI_IGNORE; - } - } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); - -#ifdef CONFIG_PREEMPT_RT - /* If the waiter interleaved with the requeue let it know */ - if (unlikely(old == Q_REQUEUE_PI_WAIT)) - rcuwait_wake_up(&q->requeue_wait); -#endif -} - -static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) -{ - int old, new; - - old = atomic_read_acquire(&q->requeue_state); - do { - /* Is requeue done already? */ - if (old >= Q_REQUEUE_PI_DONE) - return old; - - /* - * If not done, then tell the requeue code to either ignore - * the waiter or to wake it up once the requeue is done. - */ - new = Q_REQUEUE_PI_WAIT; - if (old == Q_REQUEUE_PI_NONE) - new = Q_REQUEUE_PI_IGNORE; - } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); - - /* If the requeue was in progress, wait for it to complete */ - if (old == Q_REQUEUE_PI_IN_PROGRESS) { -#ifdef CONFIG_PREEMPT_RT - rcuwait_wait_event(&q->requeue_wait, - atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, - TASK_UNINTERRUPTIBLE); -#else - (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); -#endif - } - - /* - * Requeue is now either prohibited or complete. Reread state - * because during the wait above it might have changed. Nothing - * will modify q->requeue_state after this point. - */ - return atomic_read(&q->requeue_state); -} - -/** - * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * @q: the futex_q - * @key: the key of the requeue target futex - * @hb: the hash_bucket of the requeue target futex - * - * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. - * - * 1) Set @q::key to the requeue target futex key so the waiter can detect - * the wakeup on the right futex. - * - * 2) Dequeue @q from the hash bucket. - * - * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock - * acquisition. - * - * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that - * the waiter has to fixup the pi state. - * - * 5) Complete the requeue state so the waiter can make progress. After - * this point the waiter task can return from the syscall immediately in - * case that the pi state does not have to be fixed up. - * - * 6) Wake the waiter task. - * - * Must be called with both q->lock_ptr and hb->lock held. - */ -static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, - struct futex_hash_bucket *hb) -{ - q->key = *key; - - __futex_unqueue(q); - - WARN_ON(!q->rt_waiter); - q->rt_waiter = NULL; - - q->lock_ptr = &hb->lock; - - /* Signal locked state to the waiter */ - futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); -} - -/** - * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key - * @ps: address to store the pi_state pointer - * @exiting: Pointer to store the task pointer of the owner task - * which is in the middle of exiting - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) - * - * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. If the caller specified set_waiters, - * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. - * hb1 and hb2 must be held by the caller. - * - * @exiting is only set when the return value is -EBUSY. If so, this holds - * a refcount on the exiting task on return and the caller needs to drop it - * after waiting for the exit to complete. - * - * Return: - * - 0 - failed to acquire the lock atomically; - * - >0 - acquired the lock, return value is vpid of the top_waiter - * - <0 - error - */ -static int -futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, union futex_key *key1, - union futex_key *key2, struct futex_pi_state **ps, - struct task_struct **exiting, int set_waiters) -{ - struct futex_q *top_waiter = NULL; - u32 curval; - int ret; - - if (futex_get_value_locked(&curval, pifutex)) - return -EFAULT; - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - /* - * Find the top_waiter and determine if there are additional waiters. - * If the caller intends to requeue more than 1 waiter to pifutex, - * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, - * as we have means to handle the possible fault. If not, don't set - * the bit unnecessarily as it will force the subsequent unlock to enter - * the kernel. - */ - top_waiter = futex_top_waiter(hb1, key1); - - /* There are no waiters, nothing for us to do. */ - if (!top_waiter) - return 0; - - /* - * Ensure that this is a waiter sitting in futex_wait_requeue_pi() - * and waiting on the 'waitqueue' futex which is always !PI. - */ - if (!top_waiter->rt_waiter || top_waiter->pi_state) - return -EINVAL; - - /* Ensure we requeue to the expected futex. */ - if (!futex_match(top_waiter->requeue_pi_key, key2)) - return -EINVAL; - - /* Ensure that this does not race against an early wakeup */ - if (!futex_requeue_pi_prepare(top_waiter, NULL)) - return -EAGAIN; - - /* - * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit - * in the contended case or if @set_waiters is true. - * - * In the contended case PI state is attached to the lock owner. If - * the user space lock can be acquired then PI state is attached to - * the new owner (@top_waiter->task) when @set_waiters is true. - */ - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - exiting, set_waiters); - if (ret == 1) { - /* - * Lock was acquired in user space and PI state was - * attached to @top_waiter->task. That means state is fully - * consistent and the waiter can return to user space - * immediately after the wakeup. - */ - requeue_pi_wake_futex(top_waiter, key2, hb2); - } else if (ret < 0) { - /* Rewind top_waiter::requeue_state */ - futex_requeue_pi_complete(top_waiter, ret); - } else { - /* - * futex_lock_pi_atomic() did not acquire the user space - * futex, but managed to establish the proxy lock and pi - * state. top_waiter::requeue_state cannot be fixed up here - * because the waiter is not enqueued on the rtmutex - * yet. This is handled at the callsite depending on the - * result of rt_mutex_start_proxy_lock() which is - * guaranteed to be reached with this function returning 0. - */ - } - return ret; -} - -/** - * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) - * @uaddr2: target futex user address - * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * @nr_requeue: number of waiters to requeue (0-INT_MAX) - * @cmpval: @uaddr1 expected value (or %NULL) - * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) - * - * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire - * uaddr2 atomically on behalf of the top waiter. - * - * Return: - * - >=0 - on success, the number of tasks requeued or woken; - * - <0 - on error - */ -int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int task_count = 0, ret; - struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; - struct futex_q *this, *next; - DEFINE_WAKE_Q(wake_q); - - if (nr_wake < 0 || nr_requeue < 0) - return -EINVAL; - - /* - * When PI not supported: return -ENOSYS if requeue_pi is true, - * consequently the compiler knows requeue_pi is always false past - * this point which will optimize away all the conditional code - * further down. - */ - if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) - return -ENOSYS; - - if (requeue_pi) { - /* - * Requeue PI only works on two distinct uaddrs. This - * check is only valid for private futexes. See below. - */ - if (uaddr1 == uaddr2) - return -EINVAL; - - /* - * futex_requeue() allows the caller to define the number - * of waiters to wake up via the @nr_wake argument. With - * REQUEUE_PI, waking up more than one waiter is creating - * more problems than it solves. Waking up a waiter makes - * only sense if the PI futex @uaddr2 is uncontended as - * this allows the requeue code to acquire the futex - * @uaddr2 before waking the waiter. The waiter can then - * return to user space without further action. A secondary - * wakeup would just make the futex_wait_requeue_pi() - * handling more complex, because that code would have to - * look up pi_state and do more or less all the handling - * which the requeue code has to do for the to be requeued - * waiters. So restrict the number of waiters to wake to - * one, and only wake it up when the PI futex is - * uncontended. Otherwise requeue it and let the unlock of - * the PI futex handle the wakeup. - * - * All REQUEUE_PI users, e.g. pthread_cond_signal() and - * pthread_cond_broadcast() must use nr_wake=1. - */ - if (nr_wake != 1) - return -EINVAL; - - /* - * requeue_pi requires a pi_state, try to allocate it now - * without any locks in case it fails. - */ - if (refill_pi_state_cache()) - return -ENOMEM; - } - -retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? FUTEX_WRITE : FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (requeue_pi && futex_match(&key1, &key2)) - return -EINVAL; - - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - -retry_private: - futex_hb_waiters_inc(hb2); - double_lock_hb(hb1, hb2); - - if (likely(cmpval != NULL)) { - u32 curval; - - ret = futex_get_value_locked(&curval, uaddr1); - - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); - - ret = get_user(curval, uaddr1); - if (ret) - return ret; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } - - if (requeue_pi) { - struct task_struct *exiting = NULL; - - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. We force this here where we are able to easily handle - * faults rather in the requeue loop below. - * - * Updates topwaiter::requeue_state if a top waiter exists. - */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, - &exiting, nr_requeue); - - /* - * At this point the top_waiter has either taken uaddr2 or - * is waiting on it. In both cases pi_state has been - * established and an initial refcount on it. In case of an - * error there's nothing. - * - * The top waiter's requeue_state is up to date: - * - * - If the lock was acquired atomically (ret == 1), then - * the state is Q_REQUEUE_PI_LOCKED. - * - * The top waiter has been dequeued and woken up and can - * return to user space immediately. The kernel/user - * space state is consistent. In case that there must be - * more waiters requeued the WAITERS bit in the user - * space futex is set so the top waiter task has to go - * into the syscall slowpath to unlock the futex. This - * will block until this requeue operation has been - * completed and the hash bucket locks have been - * dropped. - * - * - If the trylock failed with an error (ret < 0) then - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing - * happened", or Q_REQUEUE_PI_IGNORE when there was an - * interleaved early wakeup. - * - * - If the trylock did not succeed (ret == 0) then the - * state is either Q_REQUEUE_PI_IN_PROGRESS or - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. - * This will be cleaned up in the loop below, which - * cannot fail because futex_proxy_trylock_atomic() did - * the same sanity checks for requeue_pi as the loop - * below does. - */ - switch (ret) { - case 0: - /* We hold a reference on the pi state. */ - break; - - case 1: - /* - * futex_proxy_trylock_atomic() acquired the user space - * futex. Adjust task_count. - */ - task_count++; - ret = 0; - break; - - /* - * If the above failed, then pi_state is NULL and - * waiter::requeue_state is correct. - */ - case -EFAULT: - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - return ret; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Owner is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); - /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. - */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!futex_match(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - * - * We should never be requeueing a futex_q with a pi_state, - * which is awaiting a futex_unlock_pi(). - */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter) || - this->pi_state) { - ret = -EINVAL; - break; - } - - /* Plain futexes just wake or requeue and are done */ - if (!requeue_pi) { - if (++task_count <= nr_wake) - futex_wake_mark(&wake_q, this); - else - requeue_futex(this, hb1, hb2, &key2); - continue; - } - - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (!futex_match(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } - - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - * - * Prepare the waiter to take the rt_mutex. Take a refcount - * on the pi_state and store the pointer in the futex_q - * object of the waiter. - */ - get_pi_state(pi_state); - - /* Don't requeue when the waiter is already on the way out. */ - if (!futex_requeue_pi_prepare(this, pi_state)) { - /* - * Early woken waiter signaled that it is on the - * way out. Drop the pi_state reference and try the - * next waiter. @this->pi_state is still NULL. - */ - put_pi_state(pi_state); - continue; - } - - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - - if (ret == 1) { - /* - * We got the lock. We do neither drop the refcount - * on pi_state nor clear this->pi_state because the - * waiter needs the pi_state for cleaning up the - * user space value. It will drop the refcount - * after doing so. this::requeue_state is updated - * in the wakeup as well. - */ - requeue_pi_wake_futex(this, &key2, hb2); - task_count++; - } else if (!ret) { - /* Waiter is queued, move it to hb2 */ - requeue_futex(this, hb1, hb2, &key2); - futex_requeue_pi_complete(this, 0); - task_count++; - } else { - /* - * rt_mutex_start_proxy_lock() detected a potential - * deadlock when we tried to queue that waiter. - * Drop the pi_state reference which we took above - * and remove the pointer to the state from the - * waiters futex_q object. - */ - this->pi_state = NULL; - put_pi_state(pi_state); - futex_requeue_pi_complete(this, ret); - /* - * We stop queueing more waiters and let user space - * deal with the mess. - */ - break; - } - } - - /* - * We took an extra initial reference to the pi_state in - * futex_proxy_trylock_atomic(). We need to drop it here again. - */ - put_pi_state(pi_state); - -out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); - futex_hb_waiters_dec(hb2); - return ret ? ret : task_count; -} - /* The key must be already stored in q->key. */ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) __acquires(&hb->lock) @@ -1718,8 +983,8 @@ static long futex_wait_restart(struct restart_block *restart); * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ -static void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) +void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) { /* * The task state is guaranteed to be set before another task can @@ -1766,8 +1031,8 @@ static void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * - 0 - uaddr contains val and hb has been locked; * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) +int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; int ret; @@ -1900,225 +1165,6 @@ static long futex_wait_restart(struct restart_block *restart) } -/** - * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex - * @hb: the hash_bucket futex_q was original enqueued on - * @q: the futex_q woken while waiting to be requeued - * @timeout: the timeout associated with the wait (NULL if none) - * - * Determine the cause for the early wakeup. - * - * Return: - * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR - */ -static inline -int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, - struct hrtimer_sleeper *timeout) -{ - int ret; - - /* - * With the hb lock held, we avoid races while we process the wakeup. - * We only need to hold hb (and not hb2) to ensure atomicity as the - * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. - * It can't be requeued from uaddr2 to something else since we don't - * support a PI aware source futex for requeue. - */ - WARN_ON_ONCE(&hb->lock != q->lock_ptr); - - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - futex_hb_waiters_dec(hb); - - /* Handle spurious wakeups gracefully */ - ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; - return ret; -} - -/** - * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initially wait on (non-pi) - * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. - * @val: the expected value of uaddr - * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @uaddr2: the pi futex we will take prior to returning to user-space - * - * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake - * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to - * userspace. This ensures the rt_mutex maintains an owner when it has waiters; - * without one, the pi logic would not know which task to boost/deboost, if - * there was a need to. - * - * We call schedule in futex_wait_queue() when we enqueue and return there - * via the following-- - * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue - * 3) signal - * 4) timeout - * - * If 3, cleanup and return -ERESTARTNOINTR. - * - * If 2, we may then block on trying to take the rt_mutex and return via: - * 5) successful lock - * 6) signal - * 7) timeout - * 8) other lock acquisition failure - * - * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). - * - * If 4 or 7, we cleanup and return with -ETIMEDOUT. - * - * Return: - * - 0 - On success; - * - <0 - On error - */ -int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - u32 val, ktime_t *abs_time, u32 bitset, - u32 __user *uaddr2) -{ - struct hrtimer_sleeper timeout, *to; - struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; - union futex_key key2 = FUTEX_KEY_INIT; - struct futex_q q = futex_q_init; - struct rt_mutex_base *pi_mutex; - int res, ret; - - if (!IS_ENABLED(CONFIG_FUTEX_PI)) - return -ENOSYS; - - if (uaddr == uaddr2) - return -EINVAL; - - if (!bitset) - return -EINVAL; - - to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); - - /* - * The waiter is allocated on our stack, manipulated by the requeue - * code while we sleep on uaddr. - */ - rt_mutex_init_waiter(&rt_waiter); - - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); - if (unlikely(ret != 0)) - goto out; - - q.bitset = bitset; - q.rt_waiter = &rt_waiter; - q.requeue_pi_key = &key2; - - /* - * Prepare to wait on uaddr. On success, it holds hb->lock and q - * is initialized. - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) - goto out; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (futex_match(&q.key, &key2)) { - futex_q_unlock(hb); - ret = -EINVAL; - goto out; - } - - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue(hb, &q, to); - - switch (futex_requeue_pi_wakeup_sync(&q)) { - case Q_REQUEUE_PI_IGNORE: - /* The waiter is still on uaddr1 */ - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, to); - spin_unlock(&hb->lock); - break; - - case Q_REQUEUE_PI_LOCKED: - /* The requeue acquired the lock */ - if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); - ret = fixup_pi_owner(uaddr2, &q, true); - /* - * Drop the reference to the pi state which the - * requeue_pi() code acquired for us. - */ - put_pi_state(q.pi_state); - spin_unlock(q.lock_ptr); - /* - * Adjust the return value. It's either -EFAULT or - * success (1) but the caller expects 0 for success. - */ - ret = ret < 0 ? ret : 0; - } - break; - - case Q_REQUEUE_PI_DONE: - /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ - pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - - /* Current is not longer pi_blocked_on */ - spin_lock(q.lock_ptr); - if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) - ret = 0; - - debug_rt_mutex_free_waiter(&rt_waiter); - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_pi_owner(uaddr2, &q, !ret); - /* - * If fixup_pi_owner() returned an error, propagate that. If it - * acquired the lock, clear -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - futex_unqueue_pi(&q); - spin_unlock(q.lock_ptr); - - if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart - * by calling futex_lock_pi() directly. We could - * restart this syscall, but it would detect that - * the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart - * and return -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; - } - break; - default: - BUG(); - } - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret; -} - /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 4969e962ebee..840302aefdfc 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -3,6 +3,8 @@ #define _FUTEX_H #include +#include + #include /* @@ -118,22 +120,69 @@ enum futex_access { extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, enum futex_access rw); -extern struct futex_hash_bucket *futex_hash(union futex_key *key); - extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); +extern struct futex_hash_bucket *futex_hash(union futex_key *key); + +/** + * futex_match - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * + * Return 1 if two futex_keys are equal, 0 otherwise. + */ +static inline int futex_match(union futex_key *key1, union futex_key *key2) +{ + return (key1 && key2 + && key1->both.word == key2->both.word + && key1->both.ptr == key2->both.ptr + && key1->both.offset == key2->both.offset); +} + +extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb); +extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout); +extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); + extern int fault_in_user_writeable(u32 __user *uaddr); extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); extern int futex_get_value_locked(u32 *dest, u32 __user *from); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); +extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_unqueue_pi(struct futex_q *q); extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_inc(&hb->waiters); + /* + * Full barrier (A), see the ordering comment above. + */ + smp_mb__after_atomic(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. + */ +static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} + extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); extern void futex_q_unlock(struct futex_hash_bucket *hb); @@ -150,6 +199,30 @@ extern void get_pi_state(struct futex_pi_state *pi_state); extern void put_pi_state(struct futex_pi_state *pi_state); extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); +/* + * Express the locking dependencies for lockdep: + */ +static inline void +double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + if (hb1 <= hb2) { + spin_lock(&hb1->lock); + if (hb1 < hb2) + spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); + } else { /* hb1 > hb2 */ + spin_lock(&hb2->lock); + spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); +} + /* syscalls */ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c new file mode 100644 index 000000000000..cba8b1a6a4cc --- /dev/null +++ b/kernel/futex/requeue.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include "futex.h" +#include "../locking/rtmutex_common.h" + +/* + * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an + * underlying rtmutex. The task which is about to be requeued could have + * just woken up (timeout, signal). After the wake up the task has to + * acquire hash bucket lock, which is held by the requeue code. As a task + * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking + * and the hash bucket lock blocking would collide and corrupt state. + * + * On !PREEMPT_RT this is not a problem and everything could be serialized + * on hash bucket lock, but aside of having the benefit of common code, + * this allows to avoid doing the requeue when the task is already on the + * way out and taking the hash bucket lock of the original uaddr1 when the + * requeue has been completed. + * + * The following state transitions are valid: + * + * On the waiter side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT + * + * On the requeue side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) + * + * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this + * signals that the waiter is already on the way out. It also means that + * the waiter is still on the 'wait' futex, i.e. uaddr1. + * + * The waiter side signals early wakeup to the requeue side either through + * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending + * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately + * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, + * which means the wakeup is interleaving with a requeue in progress it has + * to wait for the requeue side to change the state. Either to DONE/LOCKED + * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex + * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by + * the requeue side when the requeue attempt failed via deadlock detection + * and therefore the waiter q is still on the uaddr1 futex. + */ +enum { + Q_REQUEUE_PI_NONE = 0, + Q_REQUEUE_PI_IGNORE, + Q_REQUEUE_PI_IN_PROGRESS, + Q_REQUEUE_PI_WAIT, + Q_REQUEUE_PI_DONE, + Q_REQUEUE_PI_LOCKED, +}; + +const struct futex_q futex_q_init = { + /* list gets initialized in futex_queue()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY, + .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), +}; + +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + futex_hb_waiters_dec(hb1); + futex_hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; + } + q->key = *key2; +} + +static inline bool futex_requeue_pi_prepare(struct futex_q *q, + struct futex_pi_state *pi_state) +{ + int old, new; + + /* + * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has + * already set Q_REQUEUE_PI_IGNORE to signal that requeue should + * ignore the waiter. + */ + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return false; + + /* + * futex_proxy_trylock_atomic() might have set it to + * IN_PROGRESS and a interleaved early wake to WAIT. + * + * It was considered to have an extra state for that + * trylock, but that would just add more conditionals + * all over the place for a dubious value. + */ + if (old != Q_REQUEUE_PI_NONE) + break; + + new = Q_REQUEUE_PI_IN_PROGRESS; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + q->pi_state = pi_state; + return true; +} + +static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return; + + if (locked >= 0) { + /* Requeue succeeded. Set DONE or LOCKED */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && + old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_DONE + locked; + } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { + /* Deadlock, no early wakeup interleave */ + new = Q_REQUEUE_PI_NONE; + } else { + /* Deadlock, early wakeup interleave. */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_IGNORE; + } + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + +#ifdef CONFIG_PREEMPT_RT + /* If the waiter interleaved with the requeue let it know */ + if (unlikely(old == Q_REQUEUE_PI_WAIT)) + rcuwait_wake_up(&q->requeue_wait); +#endif +} + +static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + /* Is requeue done already? */ + if (old >= Q_REQUEUE_PI_DONE) + return old; + + /* + * If not done, then tell the requeue code to either ignore + * the waiter or to wake it up once the requeue is done. + */ + new = Q_REQUEUE_PI_WAIT; + if (old == Q_REQUEUE_PI_NONE) + new = Q_REQUEUE_PI_IGNORE; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + /* If the requeue was in progress, wait for it to complete */ + if (old == Q_REQUEUE_PI_IN_PROGRESS) { +#ifdef CONFIG_PREEMPT_RT + rcuwait_wait_event(&q->requeue_wait, + atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, + TASK_UNINTERRUPTIBLE); +#else + (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); +#endif + } + + /* + * Requeue is now either prohibited or complete. Reread state + * because during the wait above it might have changed. Nothing + * will modify q->requeue_state after this point. + */ + return atomic_read(&q->requeue_state); +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. + * + * 1) Set @q::key to the requeue target futex key so the waiter can detect + * the wakeup on the right futex. + * + * 2) Dequeue @q from the hash bucket. + * + * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock + * acquisition. + * + * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that + * the waiter has to fixup the pi state. + * + * 5) Complete the requeue state so the waiter can make progress. After + * this point the waiter task can return from the syscall immediately in + * case that the pi state does not have to be fixed up. + * + * 6) Wake the waiter task. + * + * Must be called with both q->lock_ptr and hb->lock held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) +{ + q->key = *key; + + __futex_unqueue(q); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + q->lock_ptr = &hb->lock; + + /* Signal locked state to the waiter */ + futex_requeue_pi_complete(q, 1); + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + * + * Return: + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error + */ +static int +futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key1, + union futex_key *key2, struct futex_pi_state **ps, + struct task_struct **exiting, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret; + + if (futex_get_value_locked(&curval, pifutex)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unnecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Ensure that this is a waiter sitting in futex_wait_requeue_pi() + * and waiting on the 'waitqueue' futex which is always !PI. + */ + if (!top_waiter->rt_waiter || top_waiter->pi_state) + return -EINVAL; + + /* Ensure we requeue to the expected futex. */ + if (!futex_match(top_waiter->requeue_pi_key, key2)) + return -EINVAL; + + /* Ensure that this does not race against an early wakeup */ + if (!futex_requeue_pi_prepare(top_waiter, NULL)) + return -EAGAIN; + + /* + * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit + * in the contended case or if @set_waiters is true. + * + * In the contended case PI state is attached to the lock owner. If + * the user space lock can be acquired then PI state is attached to + * the new owner (@top_waiter->task) when @set_waiters is true. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + exiting, set_waiters); + if (ret == 1) { + /* + * Lock was acquired in user space and PI state was + * attached to @top_waiter->task. That means state is fully + * consistent and the waiter can return to user space + * immediately after the wakeup. + */ + requeue_pi_wake_futex(top_waiter, key2, hb2); + } else if (ret < 0) { + /* Rewind top_waiter::requeue_state */ + futex_requeue_pi_complete(top_waiter, ret); + } else { + /* + * futex_lock_pi_atomic() did not acquire the user space + * futex, but managed to establish the proxy lock and pi + * state. top_waiter::requeue_state cannot be fixed up here + * because the waiter is not enqueued on the rtmutex + * yet. This is handled at the callsite depending on the + * result of rt_mutex_start_proxy_lock() which is + * guaranteed to be reached with this function returning 0. + */ + } + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * @uaddr1: source futex user address + * @flags: futex flags (FLAGS_SHARED, etc.) + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Return: + * - >=0 - on success, the number of tasks requeued or woken; + * - <0 - on error + */ +int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + DEFINE_WAKE_Q(wake_q); + + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + + if (requeue_pi) { + /* + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. + */ + if (uaddr1 == uaddr2) + return -EINVAL; + + /* + * futex_requeue() allows the caller to define the number + * of waiters to wake up via the @nr_wake argument. With + * REQUEUE_PI, waking up more than one waiter is creating + * more problems than it solves. Waking up a waiter makes + * only sense if the PI futex @uaddr2 is uncontended as + * this allows the requeue code to acquire the futex + * @uaddr2 before waking the waiter. The waiter can then + * return to user space without further action. A secondary + * wakeup would just make the futex_wait_requeue_pi() + * handling more complex, because that code would have to + * look up pi_state and do more or less all the handling + * which the requeue code has to do for the to be requeued + * waiters. So restrict the number of waiters to wake to + * one, and only wake it up when the PI futex is + * uncontended. Otherwise requeue it and let the unlock of + * the PI futex handle the wakeup. + * + * All REQUEUE_PI users, e.g. pthread_cond_signal() and + * pthread_cond_broadcast() must use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + } + +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? FUTEX_WRITE : FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (requeue_pi && futex_match(&key1, &key2)) + return -EINVAL; + + hb1 = futex_hash(&key1); + hb2 = futex_hash(&key2); + +retry_private: + futex_hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { + u32 curval; + + ret = futex_get_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { + double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + + ret = get_user(curval, uaddr1); + if (ret) + return ret; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + if (requeue_pi) { + struct task_struct *exiting = NULL; + + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, + &exiting, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret == 1), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. + */ + switch (ret) { + case 0: + /* We hold a reference on the pi state. */ + break; + + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ + case -EFAULT: + double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + return ret; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; + + if (!futex_match(&this->key, &key1)) + continue; + + /* + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } + + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + futex_wake_mark(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); + continue; + } + + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (!futex_match(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } + + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. + */ + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { + /* + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. + */ + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; + } + } + + /* + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. + */ + put_pi_state(pi_state); + +out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); + futex_hb_waiters_dec(hb2); + return ret ? ret : task_count; +} + +/** + * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @timeout: the timeout associated with the wait (NULL if none) + * + * Determine the cause for the early wakeup. + * + * Return: + * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + int ret; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + WARN_ON_ONCE(&hb->lock != q->lock_ptr); + + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. + * + * We call schedule in futex_wait_queue() when we enqueue and return there + * via the following-- + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout + * + * If 3, cleanup and return -ERESTARTNOINTR. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Return: + * - 0 - On success; + * - <0 - On error + */ +int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + u32 val, ktime_t *abs_time, u32 bitset, + u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + struct rt_mutex_base *pi_mutex; + int res, ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + + if (uaddr == uaddr2) + return -EINVAL; + + if (!bitset) + return -EINVAL; + + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + rt_mutex_init_waiter(&rt_waiter); + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) + goto out; + + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + + /* + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (futex_match(&q.key, &key2)) { + futex_q_unlock(hb); + ret = -EINVAL; + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue(hb, &q, to); + + switch (futex_requeue_pi_wakeup_sync(&q)) { + case Q_REQUEUE_PI_IGNORE: + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + break; + + case Q_REQUEUE_PI_LOCKED: + /* The requeue acquired the lock */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_owner(uaddr2, &q, true); + /* + * Drop the reference to the pi state which the + * requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); + spin_unlock(q.lock_ptr); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. + */ + ret = ret < 0 ? ret : 0; + } + break; + + case Q_REQUEUE_PI_DONE: + /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + + /* Current is not longer pi_blocked_on */ + spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_pi_owner(uaddr2, &q, !ret); + /* + * If fixup_pi_owner() returned an error, propagate that. If it + * acquired the lock, clear -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + futex_unqueue_pi(&q); + spin_unlock(q.lock_ptr); + + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart + * by calling futex_lock_pi() directly. We could + * restart this syscall, but it would detect that + * the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart + * and return -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + break; + default: + BUG(); + } + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} +