OpenCloudOS-Kernel/kernel/locking/qspinlock_cna.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_CNA_LOCK_SLOWPATH
#error "do not include this file"
#endif
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
/*
 * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
 *
 * In CNA, spinning threads are organized in two queues, a primary queue for
 * threads running on the same NUMA node as the current lock holder, and a
 * secondary queue for threads running on other nodes. Schematically, it
 * looks like this:
 *
 *    cna_node
 *   +----------+     +--------+         +--------+
 *   |mcs:next  | --> |mcs:next| --> ... |mcs:next| --> NULL  [Primary queue]
 *   |mcs:locked| -.  +--------+         +--------+
 *   +----------+  |
 *                 `----------------------.
 *                                        v
 *                 +--------+         +--------+
 *                 |mcs:next| --> ... |mcs:next|  [Secondary queue]
 *                 +--------+         +--------+
 *                      ^                    |
 *                      `--------------------'
 *
 * N.B. locked := 1 if secondary queue is absent. Otherwise, it contains the
 * encoded pointer to the tail of the secondary queue, which is organized as a
 * circular list.
 *
 * After acquiring the MCS lock and before acquiring the spinlock, the MCS lock
 * holder checks whether the next waiter in the primary queue (if it exists) is
 * running on the same NUMA node. If it is not, that waiter is detached from the
 * main queue and moved into the tail of the secondary queue. This way, we
 * gradually filter the primary queue, leaving only waiters running on the same
 * preferred NUMA node.
 *
 * For more details, see https://arxiv.org/abs/1810.05600.
 *
 * Authors: Alex Kogan <alex.kogan@oracle.com>
 *          Dave Dice <dave.dice@oracle.com>
 */
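/*
 * Example of the @locked encoding described above: at the head of the queue,
 * @locked stays 1 (as in plain MCS) while the secondary queue is empty. Once
 * cna_splice_next() moves a remote waiter out, @locked holds that waiter's
 * encoded_tail (always > 1), and decode_tail() turns it back into a pointer
 * to the tail of the circular secondary queue.
 */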
#define FLUSH_SECONDARY_QUEUE 1
#define CNA_PRIORITY_NODE 0xffff
#define DEFAULT_LLC_ID 0xfffe
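
/*
 * struct cna_node - an MCS queue node extended with CNA state:
 * @llc_id:       LLC id used when ordering the queue; set to
 *                CNA_PRIORITY_NODE for priority waiters (see cna_init_node()).
 * @real_llc_id:  the CPU's actual cpu_llc_id, filled in by
 *                cna_set_llc_id_per_cpu() (DEFAULT_LLC_ID until then).
 * @encoded_tail: this node's own encoded tail value; always > 1, so that it
 *                can be told apart from the 0/1 values of @mcs.locked when it
 *                is stored there to point at the secondary queue.
 * @start_time:   0 while there is no secondary queue, the local_clock()
 *                timestamp taken when one was created, or
 *                FLUSH_SECONDARY_QUEUE once the intra-node threshold is hit.
 */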
struct cna_node {
	struct mcs_spinlock	mcs;
	u16			llc_id;
	u16			real_llc_id;
	u32			encoded_tail;	/* self */
	u64			start_time;
};
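
/*
 * Set at boot when the CNA slow path is selected in
 * cna_configure_spin_lock_slowpath(); checked by cna_set_llc_id_per_cpu().
 */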
bool use_llc_spinlock = false;
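
/*
 * Bounds how long (in ns) waiters may be held on the secondary queue before
 * the lock holder flushes it back into the primary queue; see
 * intra_node_threshold_reached() and cna_lock_handoff(). Exposed as a
 * writable module parameter below.
 */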
static ulong numa_spinlock_threshold_ns = 1000000; /* 1ms, by default */
module_param(numa_spinlock_threshold_ns, ulong, 0644);
static inline bool intra_node_threshold_reached(struct cna_node *cn)
{
	u64 current_time = local_clock();
	u64 threshold = cn->start_time + numa_spinlock_threshold_ns;

	return current_time > threshold;
}
/*
 * Controls the probability for enabling the ordering of the main queue
 * when the secondary queue is empty. The chosen value reduces the amount
 * of unnecessary shuffling of threads between the two waiting queues
 * when the contention is low, while responding fast enough and enabling
 * the shuffling when the contention is high. With the value 7, probably()
 * below returns false (and thus the ordering code runs) on roughly one of
 * every 128 passes while the secondary queue is empty.
 */
#define SHUFFLE_REDUCTION_PROB_ARG (7)
/* Per-CPU pseudo-random number seed */
static DEFINE_PER_CPU(u32, seed);
/*
 * Return false with probability 1 / 2^@num_bits.
 * Intuitively, the larger @num_bits the less likely false is to be returned.
 * @num_bits must be a number between 0 and 31.
 */
static bool probably(unsigned int num_bits)
{
	u32 s;

	s = this_cpu_read(seed);
	s = next_pseudo_random32(s);
	this_cpu_write(seed, s);

	return s & ((1 << num_bits) - 1);
}
static void __init cna_init_nodes_per_cpu(unsigned int cpu)
{
	struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
	int i;

	for (i = 0; i < MAX_NODES; i++) {
		struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i);

		/*
		 * cpu_llc_id is not initialized when this function is called,
		 * so just set a fake llc id.
		 */
		cn->real_llc_id = DEFAULT_LLC_ID;
		cn->encoded_tail = encode_tail(cpu, i);
		/*
		 * make sure @encoded_tail is not confused with other valid
		 * values for @locked (0 or 1)
		 */
		WARN_ON(cn->encoded_tail <= 1);
	}
}
/*
 * must be called after cpu_llc_id is initialized.
 */
void cna_set_llc_id_per_cpu(unsigned int cpu)
{
	struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
	int i;

	if (!use_llc_spinlock)
		return;

	for (i = 0; i < MAX_NODES; i++) {
		struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i);

		cn->real_llc_id = per_cpu(cpu_llc_id, cpu);
	}
}
static int __init cna_init_nodes(void)
{
	unsigned int cpu;

	/*
	 * this will break on 32bit architectures, so we restrict
	 * the use of CNA to 64bit only (see arch/x86/Kconfig)
	 */
	BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode));
	/* we store an encoded tail word in the node's @locked field */
	BUILD_BUG_ON(sizeof(u32) > sizeof(unsigned int));

	for_each_possible_cpu(cpu)
		cna_init_nodes_per_cpu(cpu);

	return 0;
}
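
/*
 * Initialize the CNA part of an MCS node as the owning CPU enqueues it.
 * Waiters in interrupt context, with interrupts disabled, or running an RT
 * task are treated as "priority" waiters: they get the fake CNA_PRIORITY_NODE
 * llc id, which cna_order_queue() never moves to the secondary queue.
 */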
static __always_inline void cna_init_node(struct mcs_spinlock *node)
{
	bool priority = !in_task() || irqs_disabled() || rt_task(current);
	struct cna_node *cn = (struct cna_node *)node;

	cn->llc_id = priority ? CNA_PRIORITY_NODE : cn->real_llc_id;
	cn->start_time = 0;
}
/*
 * cna_splice_head -- splice the entire secondary queue onto the head of the
 * primary queue.
 *
 * Returns the new primary head node or NULL on failure.
 */
static struct mcs_spinlock *
cna_splice_head(struct qspinlock *lock, u32 val,
		struct mcs_spinlock *node, struct mcs_spinlock *next)
{
	struct mcs_spinlock *head_2nd, *tail_2nd;
	u32 new;

	tail_2nd = decode_tail(node->locked);
	head_2nd = tail_2nd->next;

	if (next) {
		/*
		 * If the primary queue is not empty, the primary tail doesn't
		 * need to change and we can simply link the secondary tail to
		 * the old primary head.
		 */
		tail_2nd->next = next;
	} else {
		/*
		 * When the primary queue is empty, the secondary tail becomes
		 * the primary tail.
		 */

		/*
		 * Speculatively break the secondary queue's circular link such
		 * that when the secondary tail becomes the primary tail it all
		 * works out.
		 */
		tail_2nd->next = NULL;

		/*
		 * tail_2nd->next = NULL;	old = xchg_tail(lock, tail);
		 *				prev = decode_tail(old);
		 * try_cmpxchg_release(...);	WRITE_ONCE(prev->next, node);
		 *
		 * If the following cmpxchg() succeeds, our stores will not
		 * collide.
		 */
		new = ((struct cna_node *)tail_2nd)->encoded_tail |
			_Q_LOCKED_VAL;
		if (!atomic_try_cmpxchg_release(&lock->val, &val, new)) {
			/* Restore the secondary queue's circular link. */
			tail_2nd->next = head_2nd;
			return NULL;
		}
	}

	/* The primary queue head now is what was the secondary queue head. */
	return head_2nd;
}
static inline bool cna_try_clear_tail(struct qspinlock *lock, u32 val,
				      struct mcs_spinlock *node)
{
	/*
	 * We're here because the primary queue is empty; check the secondary
	 * queue for remote waiters.
	 */
	if (node->locked > 1) {
		struct mcs_spinlock *next;

		/*
		 * When there are waiters on the secondary queue, try to move
		 * them back onto the primary queue and let them rip.
		 */
		next = cna_splice_head(lock, val, node, NULL);
		if (next) {
			arch_mcs_lock_handoff(&next->locked, 1);
			return true;
		}

		return false;
	}

	/* Both queues are empty. Do what MCS does. */
	return __try_clear_tail(lock, val, node);
}
/*
 * cna_splice_next -- splice the next node from the primary queue onto
 * the secondary queue.
 */
static void cna_splice_next(struct mcs_spinlock *node,
			    struct mcs_spinlock *next,
			    struct mcs_spinlock *nnext)
{
	/* remove 'next' from the main queue */
	node->next = nnext;

	/* stick `next` on the secondary queue tail */
	if (node->locked <= 1) { /* if secondary queue is empty */
		struct cna_node *cn = (struct cna_node *)node;

		/* create secondary queue */
		next->next = next;

		cn->start_time = local_clock();
		/* secondary queue is not empty iff start_time != 0 */
		WARN_ON(!cn->start_time);
	} else {
		/* add to the tail of the secondary queue */
		struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
		struct mcs_spinlock *head_2nd = tail_2nd->next;

		tail_2nd->next = next;
		next->next = head_2nd;
	}

	node->locked = ((struct cna_node *)next)->encoded_tail;
}
/*
 * cna_order_queue - check whether the next waiter in the main queue is on
 * the same LLC/NUMA node as the lock holder; if not, and it has a waiter
 * behind it in the main queue, move the former onto the secondary queue.
 * Returns 1 if the next waiter runs on the same node (or is a priority
 * waiter); 0 otherwise.
 */
static int cna_order_queue(struct mcs_spinlock *node)
{
	struct mcs_spinlock *next = READ_ONCE(node->next);
	struct cna_node *cn = (struct cna_node *)node;
	int llc_id, next_llc_id;

	if (!next)
		return 0;

	llc_id = cn->llc_id;
	next_llc_id = ((struct cna_node *)next)->llc_id;

	if (next_llc_id != llc_id && next_llc_id != CNA_PRIORITY_NODE) {
		struct mcs_spinlock *nnext = READ_ONCE(next->next);

		if (nnext)
			cna_splice_next(node, next, nnext);

		return 0;
	}

	return 1;
}
#define LOCK_IS_BUSY(lock) (atomic_read(&(lock)->val) & _Q_LOCKED_PENDING_MASK)
/* Abuse the pv_wait_head_or_lock() hook to get some work done */
static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
						 struct mcs_spinlock *node)
{
	struct cna_node *cn = (struct cna_node *)node;

	if (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) {
		/*
		 * When the secondary queue is empty, skip the calls to
		 * cna_order_queue() below with high probability. This
		 * optimization reduces the overhead of unnecessary shuffling
		 * of threads between waiting queues when the lock is only
		 * lightly contended.
		 */
		return 0;
	}

	if (!cn->start_time || !intra_node_threshold_reached(cn)) {
		/*
		 * We are at the head of the wait queue, no need to use
		 * the fake NUMA node ID.
		 */
		if (cn->llc_id == CNA_PRIORITY_NODE)
			cn->llc_id = cn->real_llc_id;

		/*
		 * Try and put the time otherwise spent spin waiting on
		 * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
		 */
		while (LOCK_IS_BUSY(lock) && !cna_order_queue(node))
			cpu_relax();
	} else {
		cn->start_time = FLUSH_SECONDARY_QUEUE;
	}

	return 0; /* we lied; we didn't wait, go do so now */
}
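
/*
 * Hand the lock over to the next waiter. In the common (non-flush) case, a
 * non-empty secondary queue is preserved by passing its encoded tail as the
 * handoff value, and the preferred llc id and start_time are handed down to
 * the next local waiter. In the flush case, the secondary queue is spliced
 * back into the primary one and the lock goes to its head, i.e. the longest
 * waiting remote waiter.
 */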
static inline void cna_lock_handoff(struct mcs_spinlock *node,
				    struct mcs_spinlock *next)
{
	struct cna_node *cn = (struct cna_node *)node;
	u32 val = 1;

	if (cn->start_time != FLUSH_SECONDARY_QUEUE) {
		if (node->locked > 1) {
			val = node->locked;	/* preserve secondary queue */

			/*
			 * We have a local waiter, either real or fake one;
			 * reload @next in case it was changed by
			 * cna_order_queue().
			 */
			next = node->next;

			/*
			 * Pass over the LLC/NUMA node id of the primary queue,
			 * to maintain the preference even if the next waiter
			 * is on a different node.
			 */
			((struct cna_node *)next)->llc_id = cn->llc_id;
			((struct cna_node *)next)->start_time = cn->start_time;
		}
	} else {
		/*
		 * We decided to flush the secondary queue;
		 * this can only happen if that queue is not empty.
		 */
		WARN_ON(node->locked <= 1);
		/*
		 * Splice the secondary queue onto the primary queue and pass
		 * the lock to the longest waiting remote waiter.
		 */
		next = cna_splice_head(NULL, 0, node, next);
	}

	arch_mcs_lock_handoff(&next->locked, val);
}
/*
 * Constant (boot-time configurable) flag selecting the LLC-aware variant
 * of spinlock. Possible values: -1 (off, the default) / 0 (auto) / 1 (on).
 */
static int llc_spinlock_flag = -1;
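
/*
 * Parse the "llc_spinlock=" kernel boot parameter. For example, booting with
 * "llc_spinlock=on" selects the CNA slow path unconditionally, while
 * "llc_spinlock=auto" selects it only when running natively (see
 * cna_configure_spin_lock_slowpath() below); "off" keeps the default native
 * slow path.
 */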
static int __init llc_spinlock_setup(char *str)
{
	if (!strcmp(str, "auto")) {
		llc_spinlock_flag = 0;
		return 1;
	} else if (!strcmp(str, "on")) {
		llc_spinlock_flag = 1;
		return 1;
	} else if (!strcmp(str, "off")) {
		llc_spinlock_flag = -1;
		return 1;
	}

	return 0;
}
__setup("llc_spinlock=", llc_spinlock_setup);
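
/*
 * The CNA flavour of the queued spinlock slow path, generated from
 * qspinlock.c, which defines _GEN_CNA_LOCK_SLOWPATH before including this
 * header (hence the #error guard at the top of the file).
 */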
void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
/*
 * Switch to the LLC-friendly slow path for spinlocks when the user asks for
 * it with llc_spinlock=on, or with llc_spinlock=auto when running in a
 * native (non-paravirtualized) environment. With the default setting (off),
 * the native slow path is kept.
 */
void __init cna_configure_spin_lock_slowpath(void)
{
	if (llc_spinlock_flag < 0)
		return;

	if (llc_spinlock_flag == 0 &&
	    pv_ops.lock.queued_spin_lock_slowpath !=
			native_queued_spin_lock_slowpath)
		return;

	cna_init_nodes();
	use_llc_spinlock = true;
	pv_ops.lock.queued_spin_lock_slowpath = __cna_queued_spin_lock_slowpath;

	pr_info("Enabling CNA spinlock\n");
}