summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWill Deacon <will@kernel.org>2019-07-02 18:16:25 +0200
committerWill Deacon <will@kernel.org>2019-08-08 14:31:54 +0200
commit587e6c10a7ce89a5924fdbeff2ec524fbd6a124b (patch)
tree6cc02601cd05efcfebb712a740a22cea60784e1e
parentiommu/arm-smmu-v3: Operate directly on low-level queue where possible (diff)
downloadlinux-587e6c10a7ce89a5924fdbeff2ec524fbd6a124b.tar.xz
linux-587e6c10a7ce89a5924fdbeff2ec524fbd6a124b.zip
iommu/arm-smmu-v3: Reduce contention during command-queue insertion
The SMMU command queue is a bottleneck in large systems, thanks to the spin_lock which serialises accesses from all CPUs to the single queue supported by the hardware. Attempt to improve this situation by moving to a new algorithm for inserting commands into the queue, which is lock-free on the fast-path. Tested-by: Ganapatrao Kulkarni <gkulkarni@marvell.com> Signed-off-by: Will Deacon <will@kernel.org>
-rw-r--r--drivers/iommu/arm-smmu-v3.c677
1 files changed, 533 insertions, 144 deletions
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 9ebb8b39a3b1..202b4b6fc70a 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -183,7 +183,7 @@
#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
-#define Q_OVERFLOW_FLAG (1 << 31)
+#define Q_OVERFLOW_FLAG (1U << 31)
#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
#define Q_ENT(q, p) ((q)->base + \
Q_IDX(&((q)->llq), p) * \
@@ -307,6 +307,8 @@
#define CMDQ_ERR_CERROR_ABT_IDX 2
#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
+#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG
+
#define CMDQ_0_OP GENMASK_ULL(7, 0)
#define CMDQ_0_SSV (1UL << 11)
@@ -369,9 +371,8 @@
#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)
/* High-level queue structures */
-#define ARM_SMMU_POLL_TIMEOUT_US 100
-#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */
-#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
+#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
+#define ARM_SMMU_POLL_SPIN_COUNT 10
#define MSI_IOVA_BASE 0x8000000
#define MSI_IOVA_LENGTH 0x100000
@@ -473,15 +474,24 @@ struct arm_smmu_cmdq_ent {
#define CMDQ_OP_CMD_SYNC 0x46
struct {
- u32 msidata;
u64 msiaddr;
} sync;
};
};
struct arm_smmu_ll_queue {
- u32 prod;
- u32 cons;
+ union {
+ u64 val;
+ struct {
+ u32 prod;
+ u32 cons;
+ };
+ struct {
+ atomic_t prod;
+ atomic_t cons;
+ } atomic;
+ u8 __pad[SMP_CACHE_BYTES];
+ } ____cacheline_aligned_in_smp;
u32 max_n_shift;
};
@@ -499,9 +509,18 @@ struct arm_smmu_queue {
u32 __iomem *cons_reg;
};
+struct arm_smmu_queue_poll {
+ ktime_t timeout;
+ unsigned int delay;
+ unsigned int spin_cnt;
+ bool wfe;
+};
+
struct arm_smmu_cmdq {
struct arm_smmu_queue q;
- spinlock_t lock;
+ atomic_long_t *valid_map;
+ atomic_t owner_prod;
+ atomic_t lock;
};
struct arm_smmu_evtq {
@@ -581,8 +600,6 @@ struct arm_smmu_device {
int gerr_irq;
int combined_irq;
- u32 sync_nr;
- u8 prev_cmd_opcode;
unsigned long ias; /* IPA */
unsigned long oas; /* PA */
@@ -601,12 +618,6 @@ struct arm_smmu_device {
struct arm_smmu_strtab_cfg strtab_cfg;
- /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
- union {
- u32 sync_count;
- u64 padding;
- };
-
/* IOMMU core code handle */
struct iommu_device iommu;
};
@@ -690,6 +701,21 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
}
/* Low-level queue manipulation functions */
+static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
+{
+ u32 space, prod, cons;
+
+ prod = Q_IDX(q, q->prod);
+ cons = Q_IDX(q, q->cons);
+
+ if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
+ space = (1 << q->max_n_shift) - (prod - cons);
+ else
+ space = cons - prod;
+
+ return space >= n;
+}
+
static bool queue_full(struct arm_smmu_ll_queue *q)
{
return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
@@ -702,9 +728,12 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
}
-static void queue_sync_cons_in(struct arm_smmu_queue *q)
+static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
{
- q->llq.cons = readl_relaxed(q->cons_reg);
+ return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
+ (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
+ ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
+ (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
}
static void queue_sync_cons_out(struct arm_smmu_queue *q)
@@ -735,46 +764,34 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
return ret;
}
-static void queue_sync_prod_out(struct arm_smmu_queue *q)
+static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
{
- writel(q->llq.prod, q->prod_reg);
+ u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
+ return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
}
-static void queue_inc_prod(struct arm_smmu_ll_queue *q)
+static void queue_poll_init(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue_poll *qp)
{
- u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
- q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+ qp->delay = 1;
+ qp->spin_cnt = 0;
+ qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+ qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
}
-/*
- * Wait for the SMMU to consume items. If sync is true, wait until the queue
- * is empty. Otherwise, wait until there is at least one free slot.
- */
-static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
+static int queue_poll(struct arm_smmu_queue_poll *qp)
{
- ktime_t timeout;
- unsigned int delay = 1, spin_cnt = 0;
-
- /* Wait longer if it's a CMD_SYNC */
- timeout = ktime_add_us(ktime_get(), sync ?
- ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
- ARM_SMMU_POLL_TIMEOUT_US);
-
- while (queue_sync_cons_in(q),
- (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
- if (ktime_compare(ktime_get(), timeout) > 0)
- return -ETIMEDOUT;
+ if (ktime_compare(ktime_get(), qp->timeout) > 0)
+ return -ETIMEDOUT;
- if (wfe) {
- wfe();
- } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
- cpu_relax();
- continue;
- } else {
- udelay(delay);
- delay *= 2;
- spin_cnt = 0;
- }
+ if (qp->wfe) {
+ wfe();
+ } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
+ cpu_relax();
+ } else {
+ udelay(qp->delay);
+ qp->delay *= 2;
+ qp->spin_cnt = 0;
}
return 0;
@@ -788,17 +805,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
*dst++ = cpu_to_le64(*src++);
}
-static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
-{
- if (queue_full(&q->llq))
- return -ENOSPC;
-
- queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
- queue_inc_prod(&q->llq);
- queue_sync_prod_out(q);
- return 0;
-}
-
static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
{
int i;
@@ -881,20 +887,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
break;
case CMDQ_OP_CMD_SYNC:
- if (ent->sync.msiaddr)
+ if (ent->sync.msiaddr) {
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
- else
+ cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
+ } else {
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+ }
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
- /*
- * Commands are written little-endian, but we want the SMMU to
- * receive MSIData, and thus write it back to memory, in CPU
- * byte order, so big-endian needs an extra byteswap here.
- */
- cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
- cpu_to_le32(ent->sync.msidata));
- cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
break;
default:
return -ENOENT;
@@ -903,6 +903,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
return 0;
}
+static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
+ u32 prod)
+{
+ struct arm_smmu_queue *q = &smmu->cmdq.q;
+ struct arm_smmu_cmdq_ent ent = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ /*
+ * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
+ * payload, so the write will zero the entire command on that platform.
+ */
+ if (smmu->features & ARM_SMMU_FEAT_MSI &&
+ smmu->features & ARM_SMMU_FEAT_COHERENCY) {
+ ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
+ q->ent_dwords * 8;
+ }
+
+ arm_smmu_cmdq_build_cmd(cmd, &ent);
+}
+
static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
{
static const char *cerror_str[] = {
@@ -961,109 +982,440 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
}
-static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
+/*
+ * Command queue locking.
+ * This is a form of bastardised rwlock with the following major changes:
+ *
+ * - The only LOCK routines are exclusive_trylock() and shared_lock().
+ * Neither have barrier semantics, and instead provide only a control
+ * dependency.
+ *
+ * - The UNLOCK routines are supplemented with shared_tryunlock(), which
+ * fails if the caller appears to be the last lock holder (yes, this is
+ * racy). All successful UNLOCK routines have RELEASE semantics.
+ */
+static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
{
- struct arm_smmu_queue *q = &smmu->cmdq.q;
- bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+ int val;
- smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
+ /*
+ * We can try to avoid the cmpxchg() loop by simply incrementing the
+ * lock counter. When held in exclusive state, the lock counter is set
+ * to INT_MIN so these increments won't hurt as the value will remain
+ * negative.
+ */
+ if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
+ return;
+
+ do {
+ val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
+ } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
+}
+
+static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
+{
+ (void)atomic_dec_return_release(&cmdq->lock);
+}
+
+static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
+{
+ if (atomic_read(&cmdq->lock) == 1)
+ return false;
+
+ arm_smmu_cmdq_shared_unlock(cmdq);
+ return true;
+}
+
+#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \
+({ \
+ bool __ret; \
+ local_irq_save(flags); \
+ __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \
+ if (!__ret) \
+ local_irq_restore(flags); \
+ __ret; \
+})
+
+#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
+({ \
+ atomic_set_release(&cmdq->lock, 0); \
+ local_irq_restore(flags); \
+})
+
+
+/*
+ * Command queue insertion.
+ * This is made fiddly by our attempts to achieve some sort of scalability
+ * since there is one queue shared amongst all of the CPUs in the system. If
+ * you like mixed-size concurrency, dependency ordering and relaxed atomics,
+ * then you'll *love* this monstrosity.
+ *
+ * The basic idea is to split the queue up into ranges of commands that are
+ * owned by a given CPU; the owner may not have written all of the commands
+ * itself, but is responsible for advancing the hardware prod pointer when
+ * the time comes. The algorithm is roughly:
+ *
+ * 1. Allocate some space in the queue. At this point we also discover
+ * whether the head of the queue is currently owned by another CPU,
+ * or whether we are the owner.
+ *
+ * 2. Write our commands into our allocated slots in the queue.
+ *
+ * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
+ *
+ * 4. If we are an owner:
+ * a. Wait for the previous owner to finish.
+ * b. Mark the queue head as unowned, which tells us the range
+ * that we are responsible for publishing.
+ * c. Wait for all commands in our owned range to become valid.
+ * d. Advance the hardware prod pointer.
+ * e. Tell the next owner we've finished.
+ *
+ * 5. If we are inserting a CMD_SYNC (we may or may not have been an
+ * owner), then we need to stick around until it has completed:
+ * a. If we have MSIs, the SMMU can write back into the CMD_SYNC
+ * to clear the first 4 bytes.
+ * b. Otherwise, we spin waiting for the hardware cons pointer to
+ * advance past our command.
+ *
+ * The devil is in the details, particularly the use of locking for handling
+ * SYNC completion and freeing up space in the queue before we think that it is
+ * full.
+ */
+static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod, bool set)
+{
+ u32 swidx, sbidx, ewidx, ebidx;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ .prod = sprod,
+ };
- while (queue_insert_raw(q, cmd) == -ENOSPC) {
- if (queue_poll_cons(q, false, wfe))
- dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+ ewidx = BIT_WORD(Q_IDX(&llq, eprod));
+ ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
+
+ while (llq.prod != eprod) {
+ unsigned long mask;
+ atomic_long_t *ptr;
+ u32 limit = BITS_PER_LONG;
+
+ swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
+ sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
+
+ ptr = &cmdq->valid_map[swidx];
+
+ if ((swidx == ewidx) && (sbidx < ebidx))
+ limit = ebidx;
+
+ mask = GENMASK(limit - 1, sbidx);
+
+ /*
+ * The valid bit is the inverse of the wrap bit. This means
+ * that a zero-initialised queue is invalid and, after marking
+ * all entries as valid, they become invalid again when we
+ * wrap.
+ */
+ if (set) {
+ atomic_long_xor(mask, ptr);
+ } else { /* Poll */
+ unsigned long valid;
+
+ valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
+ atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
+ }
+
+ llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
}
}
-static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_ent *ent)
+/* Mark all entries in the range [sprod, eprod) as valid */
+static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod)
+{
+ __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
+}
+
+/* Wait for all entries in the range [sprod, eprod) to become valid */
+static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod)
+{
+ __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
+}
+
+/* Wait for the command queue to become non-full */
+static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- u64 cmd[CMDQ_ENT_DWORDS];
unsigned long flags;
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ int ret = 0;
- if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
- dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
- ent->opcode);
- return;
+ /*
+ * Try to update our copy of cons by grabbing exclusive cmdq access. If
+ * that fails, spin until somebody else updates it for us.
+ */
+ if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
+ WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
+ arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
+ llq->val = READ_ONCE(cmdq->q.llq.val);
+ return 0;
}
- spin_lock_irqsave(&smmu->cmdq.lock, flags);
- arm_smmu_cmdq_insert_cmd(smmu, cmd);
- spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+ queue_poll_init(smmu, &qp);
+ do {
+ llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ if (!queue_full(llq))
+ break;
+
+ ret = queue_poll(&qp);
+ } while (!ret);
+
+ return ret;
}
/*
- * The difference between val and sync_idx is bounded by the maximum size of
- * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
+ * Wait until the SMMU signals a CMD_SYNC completion MSI.
+ * Must be called with the cmdq lock held in some capacity.
*/
-static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- ktime_t timeout;
- u32 val;
+ int ret = 0;
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
- timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
- val = smp_cond_load_acquire(&smmu->sync_count,
- (int)(VAL - sync_idx) >= 0 ||
- !ktime_before(ktime_get(), timeout));
+ queue_poll_init(smmu, &qp);
- return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
+ /*
+ * The MSI won't generate an event, since it's being written back
+ * into the command queue.
+ */
+ qp.wfe = false;
+ smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
+ llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
+ return ret;
}
-static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
+/*
+ * Wait until the SMMU cons index passes llq->prod.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- u64 cmd[CMDQ_ENT_DWORDS];
- unsigned long flags;
- struct arm_smmu_cmdq_ent ent = {
- .opcode = CMDQ_OP_CMD_SYNC,
- .sync = {
- .msiaddr = virt_to_phys(&smmu->sync_count),
- },
- };
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ u32 prod = llq->prod;
+ int ret = 0;
- spin_lock_irqsave(&smmu->cmdq.lock, flags);
+ queue_poll_init(smmu, &qp);
+ llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ do {
+ if (queue_consumed(llq, prod))
+ break;
- /* Piggy-back on the previous command if it's a SYNC */
- if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
- ent.sync.msidata = smmu->sync_nr;
- } else {
- ent.sync.msidata = ++smmu->sync_nr;
- arm_smmu_cmdq_build_cmd(cmd, &ent);
- arm_smmu_cmdq_insert_cmd(smmu, cmd);
- }
+ ret = queue_poll(&qp);
- spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+ /*
+ * This needs to be a readl() so that our subsequent call
+ * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
+ *
+ * Specifically, we need to ensure that we observe all
+ * shared_lock()s by other CMD_SYNCs that share our owner,
+ * so that a failing call to tryunlock() means that we're
+ * the last one out and therefore we can safely advance
+ * cmdq->q.llq.cons. Roughly speaking:
+ *
+ * CPU 0 CPU1 CPU2 (us)
+ *
+ * if (sync)
+ * shared_lock();
+ *
+ * dma_wmb();
+ * set_valid_map();
+ *
+ * if (owner) {
+ * poll_valid_map();
+ * <control dependency>
+ * writel(prod_reg);
+ *
+ * readl(cons_reg);
+ * tryunlock();
+ *
+ * Requires us to see CPU 0's shared_lock() acquisition.
+ */
+ llq->cons = readl(cmdq->q.cons_reg);
+ } while (!ret);
- return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
+ return ret;
}
-static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- u64 cmd[CMDQ_ENT_DWORDS];
+ if (smmu->features & ARM_SMMU_FEAT_MSI &&
+ smmu->features & ARM_SMMU_FEAT_COHERENCY)
+ return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+
+ return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+}
+
+static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
+ u32 prod, int n)
+{
+ int i;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ .prod = prod,
+ };
+
+ for (i = 0; i < n; ++i) {
+ u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
+
+ prod = queue_inc_prod_n(&llq, i);
+ queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
+ }
+}
+
+static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ u64 *cmds, int n, bool sync)
+{
+ u64 cmd_sync[CMDQ_ENT_DWORDS];
+ u32 prod;
unsigned long flags;
- bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
- struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
- int ret;
+ bool owner;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ }, head = llq;
+ int ret = 0;
- arm_smmu_cmdq_build_cmd(cmd, &ent);
+ /* 1. Allocate some space in the queue */
+ local_irq_save(flags);
+ llq.val = READ_ONCE(cmdq->q.llq.val);
+ do {
+ u64 old;
+
+ while (!queue_has_space(&llq, n + sync)) {
+ local_irq_restore(flags);
+ if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+ dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+ local_irq_save(flags);
+ }
+
+ head.cons = llq.cons;
+ head.prod = queue_inc_prod_n(&llq, n + sync) |
+ CMDQ_PROD_OWNED_FLAG;
+
+ old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
+ if (old == llq.val)
+ break;
+
+ llq.val = old;
+ } while (1);
+ owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
+ head.prod &= ~CMDQ_PROD_OWNED_FLAG;
+ llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+ /*
+ * 2. Write our commands into the queue
+ * Dependency ordering from the cmpxchg() loop above.
+ */
+ arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
+ if (sync) {
+ prod = queue_inc_prod_n(&llq, n);
+ arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+ queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
+
+ /*
+ * In order to determine completion of our CMD_SYNC, we must
+ * ensure that the queue can't wrap twice without us noticing.
+ * We achieve that by taking the cmdq lock as shared before
+ * marking our slot as valid.
+ */
+ arm_smmu_cmdq_shared_lock(cmdq);
+ }
- spin_lock_irqsave(&smmu->cmdq.lock, flags);
- arm_smmu_cmdq_insert_cmd(smmu, cmd);
- ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
- spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+ /* 3. Mark our slots as valid, ensuring commands are visible first */
+ dma_wmb();
+ arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
+ /* 4. If we are the owner, take control of the SMMU hardware */
+ if (owner) {
+ /* a. Wait for previous owner to finish */
+ atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
+
+ /* b. Stop gathering work by clearing the owned flag */
+ prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
+ &cmdq->q.llq.atomic.prod);
+ prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+ /*
+ * c. Wait for any gathered work to be written to the queue.
+ * Note that we read our own entries so that we have the control
+ * dependency required by (d).
+ */
+ arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
+
+ /*
+ * d. Advance the hardware prod pointer
+ * Control dependency ordering from the entries becoming valid.
+ */
+ writel_relaxed(prod, cmdq->q.prod_reg);
+
+ /*
+ * e. Tell the next owner we're done
+ * Make sure we've updated the hardware first, so that we don't
+ * race to update prod and potentially move it backwards.
+ */
+ atomic_set_release(&cmdq->owner_prod, prod);
+ }
+
+ /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
+ if (sync) {
+ llq.prod = queue_inc_prod_n(&llq, n);
+ ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+ if (ret) {
+ dev_err_ratelimited(smmu->dev,
+ "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
+ llq.prod,
+ readl_relaxed(cmdq->q.prod_reg),
+ readl_relaxed(cmdq->q.cons_reg));
+ }
+
+ /*
+ * Try to unlock the cmq lock. This will fail if we're the last
+ * reader, in which case we can safely update cmdq->q.llq.cons
+ */
+ if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
+ WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
+ arm_smmu_cmdq_shared_unlock(cmdq);
+ }
+ }
+
+ local_irq_restore(flags);
return ret;
}
-static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
{
- int ret;
- bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
- (smmu->features & ARM_SMMU_FEAT_COHERENCY);
+ u64 cmd[CMDQ_ENT_DWORDS];
- ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
- : __arm_smmu_cmdq_issue_sync(smmu);
- if (ret)
- dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
- return ret;
+ if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+ dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+ ent->opcode);
+ return -EINVAL;
+ }
+
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+}
+
+static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+{
+ return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
}
/* Context descriptor manipulation functions */
@@ -1580,9 +1932,9 @@ static void arm_smmu_tlb_inv_context(void *cookie)
/*
* NOTE: when io-pgtable is in non-strict mode, we may get here with
* PTEs previously cleared by unmaps on the current CPU not yet visible
- * to the SMMU. We are relying on the DSB implicit in
- * queue_sync_prod_out() to guarantee those are observed before the
- * TLBI. Do be careful, 007.
+ * to the SMMU. We are relying on the dma_wmb() implicit during cmd
+ * insertion to guarantee those are observed before the TLBI. Do be
+ * careful, 007.
*/
arm_smmu_cmdq_issue_cmd(smmu, &cmd);
arm_smmu_cmdq_issue_sync(smmu);
@@ -2359,18 +2711,49 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
return 0;
}
+static void arm_smmu_cmdq_free_bitmap(void *data)
+{
+ unsigned long *bitmap = data;
+ bitmap_free(bitmap);
+}
+
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+{
+ int ret = 0;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
+ atomic_long_t *bitmap;
+
+ atomic_set(&cmdq->owner_prod, 0);
+ atomic_set(&cmdq->lock, 0);
+
+ bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
+ if (!bitmap) {
+ dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
+ ret = -ENOMEM;
+ } else {
+ cmdq->valid_map = bitmap;
+ devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
+ }
+
+ return ret;
+}
+
static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
{
int ret;
/* cmdq */
- spin_lock_init(&smmu->cmdq.lock);
ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
"cmdq");
if (ret)
return ret;
+ ret = arm_smmu_cmdq_init(smmu);
+ if (ret)
+ return ret;
+
/* evtq */
ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
@@ -2951,9 +3334,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
/* Queue sizes, capped to ensure natural alignment */
smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
FIELD_GET(IDR1_CMDQS, reg));
- if (!smmu->cmdq.q.llq.max_n_shift) {
- /* Odd alignment restrictions on the base, so ignore for now */
- dev_err(smmu->dev, "unit-length command queue not supported\n");
+ if (smmu->cmdq.q.llq.max_n_shift < ilog2(BITS_PER_LONG)) {
+ /*
+ * The cmdq valid_map relies on the total number of entries
+ * being a multiple of BITS_PER_LONG. There's also no way
+ * we can handle the weird alignment restrictions on the
+ * base pointer for a unit-length queue.
+ */
+ dev_err(smmu->dev, "command queue size < %d entries not supported\n",
+ BITS_PER_LONG);
return -ENXIO;
}