iommu/arm-smmu-v3: Reduce contention during command-queue insertion
The SMMU command queue is a bottleneck in large systems, thanks to the
spin_lock which serialises accesses from all CPUs to the single queue
supported by the hardware.

Attempt to improve this situation by moving to a new algorithm for
inserting commands into the queue, which is lock-free on the fast-path.

Tested-by: Ganapatrao Kulkarni <gkulkarni@marvell.com>
Signed-off-by: Will Deacon <will@kernel.org>
parent 7c288a5b27
commit 587e6c10a7
@@ -183,7 +183,7 @@
#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
#define Q_OVERFLOW_FLAG (1 << 31)
#define Q_OVERFLOW_FLAG (1U << 31)
#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
#define Q_ENT(q, p) ((q)->base + \
        Q_IDX(&((q)->llq), p) * \
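These macros pack three things into one 32-bit prod/cons word: the entry index in the low bits, the wrap bit just above them, and the overflow flag in bit 31. A minimal standalone sketch of that decomposition, using a hypothetical max_n_shift of 3 (an 8-entry queue) purely for illustration and the same increment scheme as queue_inc_prod_n() later in this patch:

#include <stdio.h>
#include <stdint.h>

/* Simplified copies of the driver macros for a queue of 1 << SHIFT entries. */
#define SHIFT     3                               /* hypothetical max_n_shift */
#define Q_IDX(p)  ((p) & ((1U << SHIFT) - 1))     /* entry index              */
#define Q_WRP(p)  ((p) & (1U << SHIFT))           /* wrap bit                 */
#define Q_OVF(p)  ((p) & (1U << 31))              /* overflow flag            */

int main(void)
{
    /* Walk prod forward and watch the index wrap while the wrap bit flips. */
    uint32_t prod = 0;

    for (int i = 0; i < 20; i++) {
        printf("prod=0x%03x idx=%u wrp=%u\n", prod, Q_IDX(prod), !!Q_WRP(prod));
        /* Same arithmetic as queue_inc_prod_n(q, 1). */
        uint32_t next = (Q_WRP(prod) | Q_IDX(prod)) + 1;
        prod = Q_OVF(prod) | Q_WRP(next) | Q_IDX(next);
    }
    return 0;
}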
@@ -307,6 +307,8 @@
#define CMDQ_ERR_CERROR_ABT_IDX 2
#define CMDQ_ERR_CERROR_ATC_INV_IDX 3

#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG

#define CMDQ_0_OP GENMASK_ULL(7, 0)
#define CMDQ_0_SSV (1UL << 11)

@@ -369,9 +371,8 @@
#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)

/* High-level queue structures */
#define ARM_SMMU_POLL_TIMEOUT_US 100
#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */
#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
#define ARM_SMMU_POLL_SPIN_COUNT 10

#define MSI_IOVA_BASE 0x8000000
#define MSI_IOVA_LENGTH 0x100000

@@ -473,15 +474,24 @@ struct arm_smmu_cmdq_ent {

    #define CMDQ_OP_CMD_SYNC 0x46
    struct {
        u32 msidata;
        u64 msiaddr;
    } sync;
};
};

struct arm_smmu_ll_queue {
    u32 prod;
    u32 cons;
    union {
        u64 val;
        struct {
            u32 prod;
            u32 cons;
        };
        struct {
            atomic_t prod;
            atomic_t cons;
        } atomic;
        u8 __pad[SMP_CACHE_BYTES];
    } ____cacheline_aligned_in_smp;
    u32 max_n_shift;
};

@@ -499,9 +509,18 @@ struct arm_smmu_queue {
    u32 __iomem *cons_reg;
};

struct arm_smmu_queue_poll {
    ktime_t timeout;
    unsigned int delay;
    unsigned int spin_cnt;
    bool wfe;
};

struct arm_smmu_cmdq {
    struct arm_smmu_queue q;
    spinlock_t lock;
    atomic_long_t *valid_map;
    atomic_t owner_prod;
    atomic_t lock;
};

struct arm_smmu_evtq {

@@ -581,8 +600,6 @@ struct arm_smmu_device {

    int gerr_irq;
    int combined_irq;
    u32 sync_nr;
    u8 prev_cmd_opcode;

    unsigned long ias; /* IPA */
    unsigned long oas; /* PA */

@@ -601,12 +618,6 @@ struct arm_smmu_device {

    struct arm_smmu_strtab_cfg strtab_cfg;

    /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
    union {
        u32 sync_count;
        u64 padding;
    };

    /* IOMMU core code handle */
    struct iommu_device iommu;
};

@@ -690,6 +701,21 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
}

/* Low-level queue manipulation functions */
static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
{
    u32 space, prod, cons;

    prod = Q_IDX(q, q->prod);
    cons = Q_IDX(q, q->cons);

    if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
        space = (1 << q->max_n_shift) - (prod - cons);
    else
        space = cons - prod;

    return space >= n;
}

static bool queue_full(struct arm_smmu_ll_queue *q)
{
    return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&

@@ -702,9 +728,12 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
           Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
}

static void queue_sync_cons_in(struct arm_smmu_queue *q)
static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
{
    q->llq.cons = readl_relaxed(q->cons_reg);
    return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
            (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
           ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
            (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
}

static void queue_sync_cons_out(struct arm_smmu_queue *q)

@@ -735,46 +764,34 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
    return ret;
}

static void queue_sync_prod_out(struct arm_smmu_queue *q)
static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
{
    writel(q->llq.prod, q->prod_reg);
    u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
    return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
}

static void queue_inc_prod(struct arm_smmu_ll_queue *q)
static void queue_poll_init(struct arm_smmu_device *smmu,
                            struct arm_smmu_queue_poll *qp)
{
    u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
    q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
    qp->delay = 1;
    qp->spin_cnt = 0;
    qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
    qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
}

/*
 * Wait for the SMMU to consume items. If sync is true, wait until the queue
 * is empty. Otherwise, wait until there is at least one free slot.
 */
static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
static int queue_poll(struct arm_smmu_queue_poll *qp)
{
    ktime_t timeout;
    unsigned int delay = 1, spin_cnt = 0;
    if (ktime_compare(ktime_get(), qp->timeout) > 0)
        return -ETIMEDOUT;

    /* Wait longer if it's a CMD_SYNC */
    timeout = ktime_add_us(ktime_get(), sync ?
                           ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
                           ARM_SMMU_POLL_TIMEOUT_US);

    while (queue_sync_cons_in(q),
           (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
        if (ktime_compare(ktime_get(), timeout) > 0)
            return -ETIMEDOUT;

        if (wfe) {
            wfe();
        } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
            cpu_relax();
            continue;
        } else {
            udelay(delay);
            delay *= 2;
            spin_cnt = 0;
        }
    if (qp->wfe) {
        wfe();
    } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
        cpu_relax();
    } else {
        udelay(qp->delay);
        qp->delay *= 2;
        qp->spin_cnt = 0;
    }

    return 0;

@@ -788,17 +805,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
        *dst++ = cpu_to_le64(*src++);
}

static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
{
    if (queue_full(&q->llq))
        return -ENOSPC;

    queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
    queue_inc_prod(&q->llq);
    queue_sync_prod_out(q);
    return 0;
}

static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
{
    int i;

@@ -881,20 +887,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
        cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
        break;
    case CMDQ_OP_CMD_SYNC:
        if (ent->sync.msiaddr)
        if (ent->sync.msiaddr) {
            cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
        else
            cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
        } else {
            cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
        }
        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
        /*
         * Commands are written little-endian, but we want the SMMU to
         * receive MSIData, and thus write it back to memory, in CPU
         * byte order, so big-endian needs an extra byteswap here.
         */
        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
                             cpu_to_le32(ent->sync.msidata));
        cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
        break;
    default:
        return -ENOENT;

@@ -903,6 +903,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
    return 0;
}

static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
                                         u32 prod)
{
    struct arm_smmu_queue *q = &smmu->cmdq.q;
    struct arm_smmu_cmdq_ent ent = {
        .opcode = CMDQ_OP_CMD_SYNC,
    };

    /*
     * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
     * payload, so the write will zero the entire command on that platform.
     */
    if (smmu->features & ARM_SMMU_FEAT_MSI &&
        smmu->features & ARM_SMMU_FEAT_COHERENCY) {
        ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
                           q->ent_dwords * 8;
    }

    arm_smmu_cmdq_build_cmd(cmd, &ent);
}

static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
{
    static const char *cerror_str[] = {

@@ -961,109 +982,440 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
    queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
}

static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
/*
 * Command queue locking.
 * This is a form of bastardised rwlock with the following major changes:
 *
 * - The only LOCK routines are exclusive_trylock() and shared_lock().
 *   Neither have barrier semantics, and instead provide only a control
 *   dependency.
 *
 * - The UNLOCK routines are supplemented with shared_tryunlock(), which
 *   fails if the caller appears to be the last lock holder (yes, this is
 *   racy). All successful UNLOCK routines have RELEASE semantics.
 */
static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
{
    struct arm_smmu_queue *q = &smmu->cmdq.q;
    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
    int val;

    smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
    /*
     * We can try to avoid the cmpxchg() loop by simply incrementing the
     * lock counter. When held in exclusive state, the lock counter is set
     * to INT_MIN so these increments won't hurt as the value will remain
     * negative.
     */
    if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
        return;

    while (queue_insert_raw(q, cmd) == -ENOSPC) {
        if (queue_poll_cons(q, false, wfe))
            dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
    do {
        val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
    } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
}

static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
{
    (void)atomic_dec_return_release(&cmdq->lock);
}

static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
{
    if (atomic_read(&cmdq->lock) == 1)
        return false;

    arm_smmu_cmdq_shared_unlock(cmdq);
    return true;
}

#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \
({ \
    bool __ret; \
    local_irq_save(flags); \
    __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \
    if (!__ret) \
        local_irq_restore(flags); \
    __ret; \
})

#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
({ \
    atomic_set_release(&cmdq->lock, 0); \
    local_irq_restore(flags); \
})
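A rough, standalone model of this counter scheme may make the semantics easier to see: shared holders simply increment the counter, while the exclusive path parks it at INT_MIN so that concurrent increments leave it negative and the exclusive release wipes them out with a plain store of zero. This is only an illustrative sketch in userspace C11 atomics, not the driver code; the function names mirror the driver's, and the kernel-specific pieces (relaxed/acquire/release atomic helpers, IRQ masking) are simplified away.

#include <stdatomic.h>
#include <stdbool.h>
#include <limits.h>
#include <stdio.h>

static atomic_int lock; /* 0: free, >0: shared holders, INT_MIN + n: exclusive */

static void shared_lock(void)
{
    /* Fast path: a non-negative old value means nobody holds it exclusively. */
    if (atomic_fetch_add_explicit(&lock, 1, memory_order_relaxed) >= 0)
        return;

    /* Slow path: wait for the exclusive holder to drop the lock, then retry. */
    int val;
    do {
        while ((val = atomic_load_explicit(&lock, memory_order_relaxed)) < 0)
            ; /* spin; the driver uses atomic_cond_read_relaxed() here */
    } while (!atomic_compare_exchange_weak_explicit(&lock, &val, val + 1,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
}

static void shared_unlock(void)
{
    atomic_fetch_sub_explicit(&lock, 1, memory_order_release);
}

static bool shared_tryunlock(void)
{
    /* Fail if we look like the last holder, so the caller can do extra work. */
    if (atomic_load_explicit(&lock, memory_order_relaxed) == 1)
        return false;
    shared_unlock();
    return true;
}

static bool exclusive_trylock(void)
{
    int expected = 0;
    return atomic_compare_exchange_strong_explicit(&lock, &expected, INT_MIN,
                                                   memory_order_relaxed,
                                                   memory_order_relaxed);
}

static void exclusive_unlock(void)
{
    /* Plain store of zero also erases any increments made while exclusive. */
    atomic_store_explicit(&lock, 0, memory_order_release);
}

int main(void)
{
    shared_lock();
    printf("exclusive_trylock while shared: %d\n", exclusive_trylock()); /* 0 */
    printf("tryunlock as last holder: %d\n", shared_tryunlock());        /* 0 */
    shared_unlock();
    printf("exclusive_trylock when free: %d\n", exclusive_trylock());    /* 1 */
    exclusive_unlock();
    return 0;
}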

/*
 * Command queue insertion.
 * This is made fiddly by our attempts to achieve some sort of scalability
 * since there is one queue shared amongst all of the CPUs in the system. If
 * you like mixed-size concurrency, dependency ordering and relaxed atomics,
 * then you'll *love* this monstrosity.
 *
 * The basic idea is to split the queue up into ranges of commands that are
 * owned by a given CPU; the owner may not have written all of the commands
 * itself, but is responsible for advancing the hardware prod pointer when
 * the time comes. The algorithm is roughly:
 *
 *   1. Allocate some space in the queue. At this point we also discover
 *      whether the head of the queue is currently owned by another CPU,
 *      or whether we are the owner.
 *
 *   2. Write our commands into our allocated slots in the queue.
 *
 *   3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
 *
 *   4. If we are an owner:
 *        a. Wait for the previous owner to finish.
 *        b. Mark the queue head as unowned, which tells us the range
 *           that we are responsible for publishing.
 *        c. Wait for all commands in our owned range to become valid.
 *        d. Advance the hardware prod pointer.
 *        e. Tell the next owner we've finished.
 *
 *   5. If we are inserting a CMD_SYNC (we may or may not have been an
 *      owner), then we need to stick around until it has completed:
 *        a. If we have MSIs, the SMMU can write back into the CMD_SYNC
 *           to clear the first 4 bytes.
 *        b. Otherwise, we spin waiting for the hardware cons pointer to
 *           advance past our command.
 *
 * The devil is in the details, particularly the use of locking for handling
 * SYNC completion and freeing up space in the queue before we think that it is
 * full.
 */
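Step 1 above is the heart of the lock-free fast path: a single 64-bit cmpxchg on the combined prod/cons word both reserves space and reveals whether the previous queue head was already owned. A stripped-down, single-file model of just that allocation step (userspace C11 atomics, a hypothetical 32-bit prod/cons pair packed into one 64-bit word, no wrap/overflow handling and no hardware) might look like this:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OWNED_FLAG (1u << 31)              /* mirrors CMDQ_PROD_OWNED_FLAG */

static _Atomic uint64_t llq_val;           /* prod in the low 32 bits, cons in the high */

static uint64_t pack(uint32_t prod, uint32_t cons)
{
    return (uint64_t)cons << 32 | prod;
}

/* Reserve n slots; returns the old prod index and says whether we became the owner. */
static uint32_t allocate(int n, bool *owner)
{
    uint64_t old = atomic_load_explicit(&llq_val, memory_order_relaxed);

    for (;;) {
        uint32_t prod = (uint32_t)old;
        uint32_t cons = (uint32_t)(old >> 32);
        /* A real implementation would first wait here for enough free space. */
        uint32_t new_prod = ((prod & ~OWNED_FLAG) + n) | OWNED_FLAG;

        if (atomic_compare_exchange_weak_explicit(&llq_val, &old,
                                                  pack(new_prod, cons),
                                                  memory_order_relaxed,
                                                  memory_order_relaxed)) {
            /* If the old head was not marked owned, we are the new owner. */
            *owner = !(prod & OWNED_FLAG);
            return prod & ~OWNED_FLAG;
        }
        /* old was refreshed by the failed CAS; retry. */
    }
}

int main(void)
{
    bool owner;
    uint32_t prod;

    prod = allocate(4, &owner);
    printf("first caller: prod=%u owner=%d\n", prod, owner);  /* owner=1 */
    prod = allocate(2, &owner);
    printf("second caller: prod=%u owner=%d\n", prod, owner); /* owner=0 */
    return 0;
}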
static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
                                               u32 sprod, u32 eprod, bool set)
{
    u32 swidx, sbidx, ewidx, ebidx;
    struct arm_smmu_ll_queue llq = {
        .max_n_shift = cmdq->q.llq.max_n_shift,
        .prod = sprod,
    };

    ewidx = BIT_WORD(Q_IDX(&llq, eprod));
    ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;

    while (llq.prod != eprod) {
        unsigned long mask;
        atomic_long_t *ptr;
        u32 limit = BITS_PER_LONG;

        swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
        sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;

        ptr = &cmdq->valid_map[swidx];

        if ((swidx == ewidx) && (sbidx < ebidx))
            limit = ebidx;

        mask = GENMASK(limit - 1, sbidx);

        /*
         * The valid bit is the inverse of the wrap bit. This means
         * that a zero-initialised queue is invalid and, after marking
         * all entries as valid, they become invalid again when we
         * wrap.
         */
        if (set) {
            atomic_long_xor(mask, ptr);
        } else { /* Poll */
            unsigned long valid;

            valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
            atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
        }

        llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
    }
}

static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
                                    struct arm_smmu_cmdq_ent *ent)
/* Mark all entries in the range [sprod, eprod) as valid */
static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
                                        u32 sprod, u32 eprod)
{
    __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
}

/* Wait for all entries in the range [sprod, eprod) to become valid */
static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
                                         u32 sprod, u32 eprod)
{
    __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
}
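Because the valid bit is defined as the inverse of the wrap bit, the poller can test a whole word of the bitmap at once: on a pass where the wrap bit is clear it waits for the masked bits to become 1, and on the following pass for them to return to 0. A tiny worked example of that mask arithmetic, for a hypothetical range of five slots starting at index 10 within one 64-bit word (plain userspace C standing in for the GENMASK/atomic_long helpers):

#include <stdio.h>

#define BITS_PER_LONG 64
#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))

int main(void)
{
    unsigned int sbidx = 10, limit = 15; /* hypothetical range [10, 15) in one word */
    unsigned long mask = GENMASK(limit - 1, sbidx);

    printf("mask = %#lx\n", mask); /* bits 10..14 set */

    /* Expected bitmap contents for the two passes over the queue. */
    for (int wrap = 0; wrap <= 1; wrap++) {
        unsigned long valid = (~0UL + (unsigned long)wrap) & mask;
        printf("wrap=%d -> poll until (word & mask) == %#lx\n", wrap, valid);
    }
    return 0;
}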
/* Wait for the command queue to become non-full */
static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
                                             struct arm_smmu_ll_queue *llq)
{
    unsigned long flags;
    struct arm_smmu_queue_poll qp;
    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
    int ret = 0;

    /*
     * Try to update our copy of cons by grabbing exclusive cmdq access. If
     * that fails, spin until somebody else updates it for us.
     */
    if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
        WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
        arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
        llq->val = READ_ONCE(cmdq->q.llq.val);
        return 0;
    }

    queue_poll_init(smmu, &qp);
    do {
        llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
        if (!queue_full(llq))
            break;

        ret = queue_poll(&qp);
    } while (!ret);

    return ret;
}

/*
 * Wait until the SMMU signals a CMD_SYNC completion MSI.
 * Must be called with the cmdq lock held in some capacity.
 */
static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
                                          struct arm_smmu_ll_queue *llq)
{
    int ret = 0;
    struct arm_smmu_queue_poll qp;
    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
    u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));

    queue_poll_init(smmu, &qp);

    /*
     * The MSI won't generate an event, since it's being written back
     * into the command queue.
     */
    qp.wfe = false;
    smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
    llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
    return ret;
}

/*
 * Wait until the SMMU cons index passes llq->prod.
 * Must be called with the cmdq lock held in some capacity.
 */
static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
                                               struct arm_smmu_ll_queue *llq)
{
    struct arm_smmu_queue_poll qp;
    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
    u32 prod = llq->prod;
    int ret = 0;

    queue_poll_init(smmu, &qp);
    llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
    do {
        if (queue_consumed(llq, prod))
            break;

        ret = queue_poll(&qp);

        /*
         * This needs to be a readl() so that our subsequent call
         * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
         *
         * Specifically, we need to ensure that we observe all
         * shared_lock()s by other CMD_SYNCs that share our owner,
         * so that a failing call to tryunlock() means that we're
         * the last one out and therefore we can safely advance
         * cmdq->q.llq.cons. Roughly speaking:
         *
         *   CPU 0              CPU1                    CPU2 (us)
         *
         *   if (sync)
         *       shared_lock();
         *
         *   dma_wmb();
         *   set_valid_map();
         *
         *                      if (owner) {
         *                          poll_valid_map();
         *                          <control dependency>
         *                          writel(prod_reg);
         *
         *                                              readl(cons_reg);
         *                                              tryunlock();
         *
         * Requires us to see CPU 0's shared_lock() acquisition.
         */
        llq->cons = readl(cmdq->q.cons_reg);
    } while (!ret);

    return ret;
}

static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
                                         struct arm_smmu_ll_queue *llq)
{
    if (smmu->features & ARM_SMMU_FEAT_MSI &&
        smmu->features & ARM_SMMU_FEAT_COHERENCY)
        return __arm_smmu_cmdq_poll_until_msi(smmu, llq);

    return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
}

static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
                                        u32 prod, int n)
{
    int i;
    struct arm_smmu_ll_queue llq = {
        .max_n_shift = cmdq->q.llq.max_n_shift,
        .prod = prod,
    };

    for (i = 0; i < n; ++i) {
        u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];

        prod = queue_inc_prod_n(&llq, i);
        queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
    }
}

static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
                                       u64 *cmds, int n, bool sync)
{
    u64 cmd_sync[CMDQ_ENT_DWORDS];
    u32 prod;
    unsigned long flags;
    bool owner;
    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
    struct arm_smmu_ll_queue llq = {
        .max_n_shift = cmdq->q.llq.max_n_shift,
    }, head = llq;
    int ret = 0;

    /* 1. Allocate some space in the queue */
    local_irq_save(flags);
    llq.val = READ_ONCE(cmdq->q.llq.val);
    do {
        u64 old;

        while (!queue_has_space(&llq, n + sync)) {
            local_irq_restore(flags);
            if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
                dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
            local_irq_save(flags);
        }

        head.cons = llq.cons;
        head.prod = queue_inc_prod_n(&llq, n + sync) |
                    CMDQ_PROD_OWNED_FLAG;

        old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
        if (old == llq.val)
            break;

        llq.val = old;
    } while (1);
    owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
    head.prod &= ~CMDQ_PROD_OWNED_FLAG;
    llq.prod &= ~CMDQ_PROD_OWNED_FLAG;

    /*
     * 2. Write our commands into the queue
     * Dependency ordering from the cmpxchg() loop above.
     */
    arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
    if (sync) {
        prod = queue_inc_prod_n(&llq, n);
        arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
        queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);

        /*
         * In order to determine completion of our CMD_SYNC, we must
         * ensure that the queue can't wrap twice without us noticing.
         * We achieve that by taking the cmdq lock as shared before
         * marking our slot as valid.
         */
        arm_smmu_cmdq_shared_lock(cmdq);
    }

    /* 3. Mark our slots as valid, ensuring commands are visible first */
    dma_wmb();
    arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);

    /* 4. If we are the owner, take control of the SMMU hardware */
    if (owner) {
        /* a. Wait for previous owner to finish */
        atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);

        /* b. Stop gathering work by clearing the owned flag */
        prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
                                           &cmdq->q.llq.atomic.prod);
        prod &= ~CMDQ_PROD_OWNED_FLAG;

        /*
         * c. Wait for any gathered work to be written to the queue.
         * Note that we read our own entries so that we have the control
         * dependency required by (d).
         */
        arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);

        /*
         * d. Advance the hardware prod pointer
         * Control dependency ordering from the entries becoming valid.
         */
        writel_relaxed(prod, cmdq->q.prod_reg);

        /*
         * e. Tell the next owner we're done
         * Make sure we've updated the hardware first, so that we don't
         * race to update prod and potentially move it backwards.
         */
        atomic_set_release(&cmdq->owner_prod, prod);
    }

    /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
    if (sync) {
        llq.prod = queue_inc_prod_n(&llq, n);
        ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
        if (ret) {
            dev_err_ratelimited(smmu->dev,
                                "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
                                llq.prod,
                                readl_relaxed(cmdq->q.prod_reg),
                                readl_relaxed(cmdq->q.cons_reg));
        }
        /*
         * Try to unlock the cmdq lock. This will fail if we're the last
         * reader, in which case we can safely update cmdq->q.llq.cons
         */
        if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
            WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
            arm_smmu_cmdq_shared_unlock(cmdq);
        }
    }

    local_irq_restore(flags);
    return ret;
}

static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
                                   struct arm_smmu_cmdq_ent *ent)
{
    u64 cmd[CMDQ_ENT_DWORDS];
    unsigned long flags;

    if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
        dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
                 ent->opcode);
        return;
        return -EINVAL;
    }

    spin_lock_irqsave(&smmu->cmdq.lock, flags);
    arm_smmu_cmdq_insert_cmd(smmu, cmd);
    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
}

/*
 * The difference between val and sync_idx is bounded by the maximum size of
 * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
 */
static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
{
    ktime_t timeout;
    u32 val;

    timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
    val = smp_cond_load_acquire(&smmu->sync_count,
                                (int)(VAL - sync_idx) >= 0 ||
                                !ktime_before(ktime_get(), timeout));

    return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
}

static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
{
    u64 cmd[CMDQ_ENT_DWORDS];
    unsigned long flags;
    struct arm_smmu_cmdq_ent ent = {
        .opcode = CMDQ_OP_CMD_SYNC,
        .sync = {
            .msiaddr = virt_to_phys(&smmu->sync_count),
        },
    };

    spin_lock_irqsave(&smmu->cmdq.lock, flags);

    /* Piggy-back on the previous command if it's a SYNC */
    if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
        ent.sync.msidata = smmu->sync_nr;
    } else {
        ent.sync.msidata = ++smmu->sync_nr;
        arm_smmu_cmdq_build_cmd(cmd, &ent);
        arm_smmu_cmdq_insert_cmd(smmu, cmd);
    }

    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);

    return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
}

static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
{
    u64 cmd[CMDQ_ENT_DWORDS];
    unsigned long flags;
    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
    struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
    int ret;

    arm_smmu_cmdq_build_cmd(cmd, &ent);

    spin_lock_irqsave(&smmu->cmdq.lock, flags);
    arm_smmu_cmdq_insert_cmd(smmu, cmd);
    ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);

    return ret;
    return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
}

static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
{
    int ret;
    bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
               (smmu->features & ARM_SMMU_FEAT_COHERENCY);

    ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
              : __arm_smmu_cmdq_issue_sync(smmu);
    if (ret)
        dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
    return ret;
    return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
}

/* Context descriptor manipulation functions */

@@ -1580,9 +1932,9 @@ static void arm_smmu_tlb_inv_context(void *cookie)
    /*
     * NOTE: when io-pgtable is in non-strict mode, we may get here with
     * PTEs previously cleared by unmaps on the current CPU not yet visible
     * to the SMMU. We are relying on the DSB implicit in
     * queue_sync_prod_out() to guarantee those are observed before the
     * TLBI. Do be careful, 007.
     * to the SMMU. We are relying on the dma_wmb() implicit during cmd
     * insertion to guarantee those are observed before the TLBI. Do be
     * careful, 007.
     */
    arm_smmu_cmdq_issue_cmd(smmu, &cmd);
    arm_smmu_cmdq_issue_sync(smmu);

@@ -2359,18 +2711,49 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
    return 0;
}

static void arm_smmu_cmdq_free_bitmap(void *data)
{
    unsigned long *bitmap = data;
    bitmap_free(bitmap);
}

static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
{
    int ret = 0;
    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
    unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
    atomic_long_t *bitmap;

    atomic_set(&cmdq->owner_prod, 0);
    atomic_set(&cmdq->lock, 0);

    bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
    if (!bitmap) {
        dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
        ret = -ENOMEM;
    } else {
        cmdq->valid_map = bitmap;
        devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
    }

    return ret;
}

static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
{
    int ret;

    /* cmdq */
    spin_lock_init(&smmu->cmdq.lock);
    ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
                                  ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
                                  "cmdq");
    if (ret)
        return ret;

    ret = arm_smmu_cmdq_init(smmu);
    if (ret)
        return ret;

    /* evtq */
    ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
                                  ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,

@@ -2951,9 +3334,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
    /* Queue sizes, capped to ensure natural alignment */
    smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
                                         FIELD_GET(IDR1_CMDQS, reg));
    if (!smmu->cmdq.q.llq.max_n_shift) {
        /* Odd alignment restrictions on the base, so ignore for now */
        dev_err(smmu->dev, "unit-length command queue not supported\n");
    if (smmu->cmdq.q.llq.max_n_shift < ilog2(BITS_PER_LONG)) {
        /*
         * The cmdq valid_map relies on the total number of entries
         * being a multiple of BITS_PER_LONG. There's also no way
         * we can handle the weird alignment restrictions on the
         * base pointer for a unit-length queue.
         */
        dev_err(smmu->dev, "command queue size < %d entries not supported\n",
                BITS_PER_LONG);
        return -ENXIO;
    }