summaryrefslogtreecommitdiffstats
path: root/drivers/iommu/arm-smmu-v3.c
diff options
context:
space:
mode:
authorJoerg Roedel <jroedel@suse.de>2019-08-23 17:41:59 +0200
committerJoerg Roedel <jroedel@suse.de>2019-08-23 17:41:59 +0200
commitdbe8e6a81a2a45dfa210c2c271472db80efaa03f (patch)
treed8c11ee8159d5730fdd419dc0d463b80757bc314 /drivers/iommu/arm-smmu-v3.c
parentLinux 5.3-rc5 (diff)
parentMerge branches 'for-joerg/arm-smmu/smmu-v2' and 'for-joerg/arm-smmu/smmu-v3' ... (diff)
downloadlinux-dbe8e6a81a2a45dfa210c2c271472db80efaa03f.tar.xz
linux-dbe8e6a81a2a45dfa210c2c271472db80efaa03f.zip
Merge branch 'for-joerg/arm-smmu/updates' of git://git.kernel.org/pub/scm/linux/kernel/git/will/linux into arm/smmu
Diffstat (limited to 'drivers/iommu/arm-smmu-v3.c')
-rw-r--r--drivers/iommu/arm-smmu-v3.c973
1 files changed, 736 insertions, 237 deletions
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index c5c93e48b4db..29b773e186c8 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -181,12 +181,13 @@
#define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1
#define ARM_SMMU_MEMATTR_OIWB 0xf
-#define Q_IDX(q, p) ((p) & ((1 << (q)->max_n_shift) - 1))
-#define Q_WRP(q, p) ((p) & (1 << (q)->max_n_shift))
-#define Q_OVERFLOW_FLAG (1 << 31)
-#define Q_OVF(q, p) ((p) & Q_OVERFLOW_FLAG)
+#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
+#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
+#define Q_OVERFLOW_FLAG (1U << 31)
+#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
#define Q_ENT(q, p) ((q)->base + \
- Q_IDX(q, p) * (q)->ent_dwords)
+ Q_IDX(&((q)->llq), p) * \
+ (q)->ent_dwords)
#define Q_BASE_RWA (1UL << 62)
#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5)
@@ -306,6 +307,15 @@
#define CMDQ_ERR_CERROR_ABT_IDX 2
#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
+#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG
+
+/*
+ * This is used to size the command queue and therefore must be at least
+ * BITS_PER_LONG so that the valid_map works correctly (it relies on the
+ * total number of queue entries being a multiple of BITS_PER_LONG).
+ */
+#define CMDQ_BATCH_ENTRIES BITS_PER_LONG
+
#define CMDQ_0_OP GENMASK_ULL(7, 0)
#define CMDQ_0_SSV (1UL << 11)
@@ -368,9 +378,8 @@
#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)
/* High-level queue structures */
-#define ARM_SMMU_POLL_TIMEOUT_US 100
-#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */
-#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
+#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
+#define ARM_SMMU_POLL_SPIN_COUNT 10
#define MSI_IOVA_BASE 0x8000000
#define MSI_IOVA_LENGTH 0x100000
@@ -472,13 +481,29 @@ struct arm_smmu_cmdq_ent {
#define CMDQ_OP_CMD_SYNC 0x46
struct {
- u32 msidata;
u64 msiaddr;
} sync;
};
};
+struct arm_smmu_ll_queue {
+ union {
+ u64 val;
+ struct {
+ u32 prod;
+ u32 cons;
+ };
+ struct {
+ atomic_t prod;
+ atomic_t cons;
+ } atomic;
+ u8 __pad[SMP_CACHE_BYTES];
+ } ____cacheline_aligned_in_smp;
+ u32 max_n_shift;
+};
+
struct arm_smmu_queue {
+ struct arm_smmu_ll_queue llq;
int irq; /* Wired interrupt */
__le64 *base;
@@ -486,17 +511,23 @@ struct arm_smmu_queue {
u64 q_base;
size_t ent_dwords;
- u32 max_n_shift;
- u32 prod;
- u32 cons;
u32 __iomem *prod_reg;
u32 __iomem *cons_reg;
};
+struct arm_smmu_queue_poll {
+ ktime_t timeout;
+ unsigned int delay;
+ unsigned int spin_cnt;
+ bool wfe;
+};
+
struct arm_smmu_cmdq {
struct arm_smmu_queue q;
- spinlock_t lock;
+ atomic_long_t *valid_map;
+ atomic_t owner_prod;
+ atomic_t lock;
};
struct arm_smmu_evtq {
@@ -576,8 +607,6 @@ struct arm_smmu_device {
int gerr_irq;
int combined_irq;
- u32 sync_nr;
- u8 prev_cmd_opcode;
unsigned long ias; /* IPA */
unsigned long oas; /* PA */
@@ -596,12 +625,6 @@ struct arm_smmu_device {
struct arm_smmu_strtab_cfg strtab_cfg;
- /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
- union {
- u32 sync_count;
- u64 padding;
- };
-
/* IOMMU core code handle */
struct iommu_device iommu;
};
@@ -614,7 +637,7 @@ struct arm_smmu_master {
struct list_head domain_head;
u32 *sids;
unsigned int num_sids;
- bool ats_enabled :1;
+ bool ats_enabled;
};
/* SMMU private data for an IOMMU domain */
@@ -631,6 +654,7 @@ struct arm_smmu_domain {
struct io_pgtable_ops *pgtbl_ops;
bool non_strict;
+ atomic_t nr_ats_masters;
enum arm_smmu_domain_stage stage;
union {
@@ -685,85 +709,97 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
}
/* Low-level queue manipulation functions */
-static bool queue_full(struct arm_smmu_queue *q)
+static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
+{
+ u32 space, prod, cons;
+
+ prod = Q_IDX(q, q->prod);
+ cons = Q_IDX(q, q->cons);
+
+ if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
+ space = (1 << q->max_n_shift) - (prod - cons);
+ else
+ space = cons - prod;
+
+ return space >= n;
+}
+
+static bool queue_full(struct arm_smmu_ll_queue *q)
{
return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
Q_WRP(q, q->prod) != Q_WRP(q, q->cons);
}
-static bool queue_empty(struct arm_smmu_queue *q)
+static bool queue_empty(struct arm_smmu_ll_queue *q)
{
return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
}
-static void queue_sync_cons(struct arm_smmu_queue *q)
+static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
{
- q->cons = readl_relaxed(q->cons_reg);
+ return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
+ (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
+ ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
+ (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
}
-static void queue_inc_cons(struct arm_smmu_queue *q)
+static void queue_sync_cons_out(struct arm_smmu_queue *q)
{
- u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
-
- q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
-
/*
* Ensure that all CPU accesses (reads and writes) to the queue
* are complete before we update the cons pointer.
*/
mb();
- writel_relaxed(q->cons, q->cons_reg);
+ writel_relaxed(q->llq.cons, q->cons_reg);
+}
+
+static void queue_inc_cons(struct arm_smmu_ll_queue *q)
+{
+ u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
+ q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
}
-static int queue_sync_prod(struct arm_smmu_queue *q)
+static int queue_sync_prod_in(struct arm_smmu_queue *q)
{
int ret = 0;
u32 prod = readl_relaxed(q->prod_reg);
- if (Q_OVF(q, prod) != Q_OVF(q, q->prod))
+ if (Q_OVF(prod) != Q_OVF(q->llq.prod))
ret = -EOVERFLOW;
- q->prod = prod;
+ q->llq.prod = prod;
return ret;
}
-static void queue_inc_prod(struct arm_smmu_queue *q)
+static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
{
- u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
-
- q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
- writel(q->prod, q->prod_reg);
+ u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
+ return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
}
-/*
- * Wait for the SMMU to consume items. If sync is true, wait until the queue
- * is empty. Otherwise, wait until there is at least one free slot.
- */
-static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
+static void queue_poll_init(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue_poll *qp)
{
- ktime_t timeout;
- unsigned int delay = 1, spin_cnt = 0;
-
- /* Wait longer if it's a CMD_SYNC */
- timeout = ktime_add_us(ktime_get(), sync ?
- ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
- ARM_SMMU_POLL_TIMEOUT_US);
+ qp->delay = 1;
+ qp->spin_cnt = 0;
+ qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+ qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
+}
- while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) {
- if (ktime_compare(ktime_get(), timeout) > 0)
- return -ETIMEDOUT;
+static int queue_poll(struct arm_smmu_queue_poll *qp)
+{
+ if (ktime_compare(ktime_get(), qp->timeout) > 0)
+ return -ETIMEDOUT;
- if (wfe) {
- wfe();
- } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
- cpu_relax();
- continue;
- } else {
- udelay(delay);
- delay *= 2;
- spin_cnt = 0;
- }
+ if (qp->wfe) {
+ wfe();
+ } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
+ cpu_relax();
+ } else {
+ udelay(qp->delay);
+ qp->delay *= 2;
+ qp->spin_cnt = 0;
}
return 0;
@@ -777,16 +813,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
*dst++ = cpu_to_le64(*src++);
}
-static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
-{
- if (queue_full(q))
- return -ENOSPC;
-
- queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
- queue_inc_prod(q);
- return 0;
-}
-
static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
{
int i;
@@ -797,11 +823,12 @@ static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
{
- if (queue_empty(q))
+ if (queue_empty(&q->llq))
return -EAGAIN;
- queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords);
- queue_inc_cons(q);
+ queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
+ queue_inc_cons(&q->llq);
+ queue_sync_cons_out(q);
return 0;
}
@@ -868,20 +895,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
break;
case CMDQ_OP_CMD_SYNC:
- if (ent->sync.msiaddr)
+ if (ent->sync.msiaddr) {
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
- else
+ cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
+ } else {
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+ }
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
- /*
- * Commands are written little-endian, but we want the SMMU to
- * receive MSIData, and thus write it back to memory, in CPU
- * byte order, so big-endian needs an extra byteswap here.
- */
- cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
- cpu_to_le32(ent->sync.msidata));
- cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
break;
default:
return -ENOENT;
@@ -890,6 +911,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
return 0;
}
+static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
+ u32 prod)
+{
+ struct arm_smmu_queue *q = &smmu->cmdq.q;
+ struct arm_smmu_cmdq_ent ent = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ /*
+ * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
+ * payload, so the write will zero the entire command on that platform.
+ */
+ if (smmu->features & ARM_SMMU_FEAT_MSI &&
+ smmu->features & ARM_SMMU_FEAT_COHERENCY) {
+ ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
+ q->ent_dwords * 8;
+ }
+
+ arm_smmu_cmdq_build_cmd(cmd, &ent);
+}
+
static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
{
static const char *cerror_str[] = {
@@ -948,109 +990,456 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
}
-static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
+/*
+ * Command queue locking.
+ * This is a form of bastardised rwlock with the following major changes:
+ *
+ * - The only LOCK routines are exclusive_trylock() and shared_lock().
+ * Neither have barrier semantics, and instead provide only a control
+ * dependency.
+ *
+ * - The UNLOCK routines are supplemented with shared_tryunlock(), which
+ * fails if the caller appears to be the last lock holder (yes, this is
+ * racy). All successful UNLOCK routines have RELEASE semantics.
+ */
+static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
{
- struct arm_smmu_queue *q = &smmu->cmdq.q;
- bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+ int val;
+
+ /*
+ * We can try to avoid the cmpxchg() loop by simply incrementing the
+ * lock counter. When held in exclusive state, the lock counter is set
+ * to INT_MIN so these increments won't hurt as the value will remain
+ * negative.
+ */
+ if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
+ return;
+
+ do {
+ val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
+ } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
+}
+
+static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
+{
+ (void)atomic_dec_return_release(&cmdq->lock);
+}
+
+static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
+{
+ if (atomic_read(&cmdq->lock) == 1)
+ return false;
+
+ arm_smmu_cmdq_shared_unlock(cmdq);
+ return true;
+}
+
+#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \
+({ \
+ bool __ret; \
+ local_irq_save(flags); \
+ __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \
+ if (!__ret) \
+ local_irq_restore(flags); \
+ __ret; \
+})
+
+#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
+({ \
+ atomic_set_release(&cmdq->lock, 0); \
+ local_irq_restore(flags); \
+})
+
+
+/*
+ * Command queue insertion.
+ * This is made fiddly by our attempts to achieve some sort of scalability
+ * since there is one queue shared amongst all of the CPUs in the system. If
+ * you like mixed-size concurrency, dependency ordering and relaxed atomics,
+ * then you'll *love* this monstrosity.
+ *
+ * The basic idea is to split the queue up into ranges of commands that are
+ * owned by a given CPU; the owner may not have written all of the commands
+ * itself, but is responsible for advancing the hardware prod pointer when
+ * the time comes. The algorithm is roughly:
+ *
+ * 1. Allocate some space in the queue. At this point we also discover
+ * whether the head of the queue is currently owned by another CPU,
+ * or whether we are the owner.
+ *
+ * 2. Write our commands into our allocated slots in the queue.
+ *
+ * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
+ *
+ * 4. If we are an owner:
+ * a. Wait for the previous owner to finish.
+ * b. Mark the queue head as unowned, which tells us the range
+ * that we are responsible for publishing.
+ * c. Wait for all commands in our owned range to become valid.
+ * d. Advance the hardware prod pointer.
+ * e. Tell the next owner we've finished.
+ *
+ * 5. If we are inserting a CMD_SYNC (we may or may not have been an
+ * owner), then we need to stick around until it has completed:
+ * a. If we have MSIs, the SMMU can write back into the CMD_SYNC
+ * to clear the first 4 bytes.
+ * b. Otherwise, we spin waiting for the hardware cons pointer to
+ * advance past our command.
+ *
+ * The devil is in the details, particularly the use of locking for handling
+ * SYNC completion and freeing up space in the queue before we think that it is
+ * full.
+ */
+static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod, bool set)
+{
+ u32 swidx, sbidx, ewidx, ebidx;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ .prod = sprod,
+ };
+
+ ewidx = BIT_WORD(Q_IDX(&llq, eprod));
+ ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
- smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
+ while (llq.prod != eprod) {
+ unsigned long mask;
+ atomic_long_t *ptr;
+ u32 limit = BITS_PER_LONG;
- while (queue_insert_raw(q, cmd) == -ENOSPC) {
- if (queue_poll_cons(q, false, wfe))
- dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+ swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
+ sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
+
+ ptr = &cmdq->valid_map[swidx];
+
+ if ((swidx == ewidx) && (sbidx < ebidx))
+ limit = ebidx;
+
+ mask = GENMASK(limit - 1, sbidx);
+
+ /*
+ * The valid bit is the inverse of the wrap bit. This means
+ * that a zero-initialised queue is invalid and, after marking
+ * all entries as valid, they become invalid again when we
+ * wrap.
+ */
+ if (set) {
+ atomic_long_xor(mask, ptr);
+ } else { /* Poll */
+ unsigned long valid;
+
+ valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
+ atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
+ }
+
+ llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
}
}
-static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq_ent *ent)
+/* Mark all entries in the range [sprod, eprod) as valid */
+static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod)
+{
+ __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
+}
+
+/* Wait for all entries in the range [sprod, eprod) to become valid */
+static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
+ u32 sprod, u32 eprod)
+{
+ __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
+}
+
+/* Wait for the command queue to become non-full */
+static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- u64 cmd[CMDQ_ENT_DWORDS];
unsigned long flags;
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ int ret = 0;
- if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
- dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
- ent->opcode);
- return;
+ /*
+ * Try to update our copy of cons by grabbing exclusive cmdq access. If
+ * that fails, spin until somebody else updates it for us.
+ */
+ if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
+ WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
+ arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
+ llq->val = READ_ONCE(cmdq->q.llq.val);
+ return 0;
}
- spin_lock_irqsave(&smmu->cmdq.lock, flags);
- arm_smmu_cmdq_insert_cmd(smmu, cmd);
- spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+ queue_poll_init(smmu, &qp);
+ do {
+ llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ if (!queue_full(llq))
+ break;
+
+ ret = queue_poll(&qp);
+ } while (!ret);
+
+ return ret;
}
/*
- * The difference between val and sync_idx is bounded by the maximum size of
- * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
+ * Wait until the SMMU signals a CMD_SYNC completion MSI.
+ * Must be called with the cmdq lock held in some capacity.
*/
-static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- ktime_t timeout;
- u32 val;
+ int ret = 0;
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
- timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
- val = smp_cond_load_acquire(&smmu->sync_count,
- (int)(VAL - sync_idx) >= 0 ||
- !ktime_before(ktime_get(), timeout));
+ queue_poll_init(smmu, &qp);
- return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
+ /*
+ * The MSI won't generate an event, since it's being written back
+ * into the command queue.
+ */
+ qp.wfe = false;
+ smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
+ llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
+ return ret;
}
-static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
+/*
+ * Wait until the SMMU cons index passes llq->prod.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- u64 cmd[CMDQ_ENT_DWORDS];
- unsigned long flags;
- struct arm_smmu_cmdq_ent ent = {
- .opcode = CMDQ_OP_CMD_SYNC,
- .sync = {
- .msiaddr = virt_to_phys(&smmu->sync_count),
- },
- };
+ struct arm_smmu_queue_poll qp;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ u32 prod = llq->prod;
+ int ret = 0;
- spin_lock_irqsave(&smmu->cmdq.lock, flags);
+ queue_poll_init(smmu, &qp);
+ llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+ do {
+ if (queue_consumed(llq, prod))
+ break;
- /* Piggy-back on the previous command if it's a SYNC */
- if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
- ent.sync.msidata = smmu->sync_nr;
- } else {
- ent.sync.msidata = ++smmu->sync_nr;
- arm_smmu_cmdq_build_cmd(cmd, &ent);
- arm_smmu_cmdq_insert_cmd(smmu, cmd);
- }
+ ret = queue_poll(&qp);
- spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+ /*
+ * This needs to be a readl() so that our subsequent call
+ * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
+ *
+ * Specifically, we need to ensure that we observe all
+ * shared_lock()s by other CMD_SYNCs that share our owner,
+ * so that a failing call to tryunlock() means that we're
+ * the last one out and therefore we can safely advance
+ * cmdq->q.llq.cons. Roughly speaking:
+ *
+ * CPU 0 CPU1 CPU2 (us)
+ *
+ * if (sync)
+ * shared_lock();
+ *
+ * dma_wmb();
+ * set_valid_map();
+ *
+ * if (owner) {
+ * poll_valid_map();
+ * <control dependency>
+ * writel(prod_reg);
+ *
+ * readl(cons_reg);
+ * tryunlock();
+ *
+ * Requires us to see CPU 0's shared_lock() acquisition.
+ */
+ llq->cons = readl(cmdq->q.cons_reg);
+ } while (!ret);
- return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
+ return ret;
}
-static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
+ struct arm_smmu_ll_queue *llq)
{
- u64 cmd[CMDQ_ENT_DWORDS];
+ if (smmu->features & ARM_SMMU_FEAT_MSI &&
+ smmu->features & ARM_SMMU_FEAT_COHERENCY)
+ return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+
+ return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+}
+
+static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
+ u32 prod, int n)
+{
+ int i;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ .prod = prod,
+ };
+
+ for (i = 0; i < n; ++i) {
+ u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
+
+ prod = queue_inc_prod_n(&llq, i);
+ queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
+ }
+}
+
+/*
+ * This is the actual insertion function, and provides the following
+ * ordering guarantees to callers:
+ *
+ * - There is a dma_wmb() before publishing any commands to the queue.
+ * This can be relied upon to order prior writes to data structures
+ * in memory (such as a CD or an STE) before the command.
+ *
+ * - On completion of a CMD_SYNC, there is a control dependency.
+ * This can be relied upon to order subsequent writes to memory (e.g.
+ * freeing an IOVA) after completion of the CMD_SYNC.
+ *
+ * - Command insertion is totally ordered, so if two CPUs each race to
+ * insert their own list of commands then all of the commands from one
+ * CPU will appear before any of the commands from the other CPU.
+ */
+static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ u64 *cmds, int n, bool sync)
+{
+ u64 cmd_sync[CMDQ_ENT_DWORDS];
+ u32 prod;
unsigned long flags;
- bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
- struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
- int ret;
+ bool owner;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ struct arm_smmu_ll_queue llq = {
+ .max_n_shift = cmdq->q.llq.max_n_shift,
+ }, head = llq;
+ int ret = 0;
- arm_smmu_cmdq_build_cmd(cmd, &ent);
+ /* 1. Allocate some space in the queue */
+ local_irq_save(flags);
+ llq.val = READ_ONCE(cmdq->q.llq.val);
+ do {
+ u64 old;
+
+ while (!queue_has_space(&llq, n + sync)) {
+ local_irq_restore(flags);
+ if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+ dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+ local_irq_save(flags);
+ }
+
+ head.cons = llq.cons;
+ head.prod = queue_inc_prod_n(&llq, n + sync) |
+ CMDQ_PROD_OWNED_FLAG;
+
+ old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
+ if (old == llq.val)
+ break;
+
+ llq.val = old;
+ } while (1);
+ owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
+ head.prod &= ~CMDQ_PROD_OWNED_FLAG;
+ llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+ /*
+ * 2. Write our commands into the queue
+ * Dependency ordering from the cmpxchg() loop above.
+ */
+ arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
+ if (sync) {
+ prod = queue_inc_prod_n(&llq, n);
+ arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+ queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
+
+ /*
+ * In order to determine completion of our CMD_SYNC, we must
+ * ensure that the queue can't wrap twice without us noticing.
+ * We achieve that by taking the cmdq lock as shared before
+ * marking our slot as valid.
+ */
+ arm_smmu_cmdq_shared_lock(cmdq);
+ }
+
+ /* 3. Mark our slots as valid, ensuring commands are visible first */
+ dma_wmb();
+ arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
+
+ /* 4. If we are the owner, take control of the SMMU hardware */
+ if (owner) {
+ /* a. Wait for previous owner to finish */
+ atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
+
+ /* b. Stop gathering work by clearing the owned flag */
+ prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
+ &cmdq->q.llq.atomic.prod);
+ prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+ /*
+ * c. Wait for any gathered work to be written to the queue.
+ * Note that we read our own entries so that we have the control
+ * dependency required by (d).
+ */
+ arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
+
+ /*
+ * d. Advance the hardware prod pointer
+ * Control dependency ordering from the entries becoming valid.
+ */
+ writel_relaxed(prod, cmdq->q.prod_reg);
+
+ /*
+ * e. Tell the next owner we're done
+ * Make sure we've updated the hardware first, so that we don't
+ * race to update prod and potentially move it backwards.
+ */
+ atomic_set_release(&cmdq->owner_prod, prod);
+ }
+
+ /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
+ if (sync) {
+ llq.prod = queue_inc_prod_n(&llq, n);
+ ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+ if (ret) {
+ dev_err_ratelimited(smmu->dev,
+ "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
+ llq.prod,
+ readl_relaxed(cmdq->q.prod_reg),
+ readl_relaxed(cmdq->q.cons_reg));
+ }
- spin_lock_irqsave(&smmu->cmdq.lock, flags);
- arm_smmu_cmdq_insert_cmd(smmu, cmd);
- ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
- spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+ /*
+ * Try to unlock the cmq lock. This will fail if we're the last
+ * reader, in which case we can safely update cmdq->q.llq.cons
+ */
+ if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
+ WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
+ arm_smmu_cmdq_shared_unlock(cmdq);
+ }
+ }
+ local_irq_restore(flags);
return ret;
}
-static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
{
- int ret;
- bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
- (smmu->features & ARM_SMMU_FEAT_COHERENCY);
+ u64 cmd[CMDQ_ENT_DWORDS];
- ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
- : __arm_smmu_cmdq_issue_sync(smmu);
- if (ret)
- dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
- return ret;
+ if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+ dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+ ent->opcode);
+ return -EINVAL;
+ }
+
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+}
+
+static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+{
+ return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
}
/* Context descriptor manipulation functions */
@@ -1305,6 +1694,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
int i;
struct arm_smmu_device *smmu = dev;
struct arm_smmu_queue *q = &smmu->evtq.q;
+ struct arm_smmu_ll_queue *llq = &q->llq;
u64 evt[EVTQ_ENT_DWORDS];
do {
@@ -1322,12 +1712,13 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
* Not much we can do on overflow, so scream and pretend we're
* trying harder.
*/
- if (queue_sync_prod(q) == -EOVERFLOW)
+ if (queue_sync_prod_in(q) == -EOVERFLOW)
dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
- } while (!queue_empty(q));
+ } while (!queue_empty(llq));
/* Sync our overflow flag, as we believe we're up to speed */
- q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+ llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
+ Q_IDX(llq, llq->cons);
return IRQ_HANDLED;
}
@@ -1373,19 +1764,21 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
{
struct arm_smmu_device *smmu = dev;
struct arm_smmu_queue *q = &smmu->priq.q;
+ struct arm_smmu_ll_queue *llq = &q->llq;
u64 evt[PRIQ_ENT_DWORDS];
do {
while (!queue_remove_raw(q, evt))
arm_smmu_handle_ppr(smmu, evt);
- if (queue_sync_prod(q) == -EOVERFLOW)
+ if (queue_sync_prod_in(q) == -EOVERFLOW)
dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
- } while (!queue_empty(q));
+ } while (!queue_empty(llq));
/* Sync our overflow flag, as we believe we're up to speed */
- q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
- writel(q->cons, q->cons_reg);
+ llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) |
+ Q_IDX(llq, llq->cons);
+ queue_sync_cons_out(q);
return IRQ_HANDLED;
}
@@ -1534,6 +1927,23 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
return 0;
+ /*
+ * Ensure that we've completed prior invalidation of the main TLBs
+ * before we read 'nr_ats_masters' in case of a concurrent call to
+ * arm_smmu_enable_ats():
+ *
+ * // unmap() // arm_smmu_enable_ats()
+ * TLBI+SYNC atomic_inc(&nr_ats_masters);
+ * smp_mb(); [...]
+ * atomic_read(&nr_ats_masters); pci_enable_ats() // writel()
+ *
+ * Ensures that we always see the incremented 'nr_ats_masters' count if
+ * ATS was enabled at the PCI device before completion of the TLBI.
+ */
+ smp_mb();
+ if (!atomic_read(&smmu_domain->nr_ats_masters))
+ return 0;
+
arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
@@ -1545,13 +1955,6 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
}
/* IO_PGTABLE API */
-static void arm_smmu_tlb_sync(void *cookie)
-{
- struct arm_smmu_domain *smmu_domain = cookie;
-
- arm_smmu_cmdq_issue_sync(smmu_domain->smmu);
-}
-
static void arm_smmu_tlb_inv_context(void *cookie)
{
struct arm_smmu_domain *smmu_domain = cookie;
@@ -1570,25 +1973,32 @@ static void arm_smmu_tlb_inv_context(void *cookie)
/*
* NOTE: when io-pgtable is in non-strict mode, we may get here with
* PTEs previously cleared by unmaps on the current CPU not yet visible
- * to the SMMU. We are relying on the DSB implicit in queue_inc_prod()
- * to guarantee those are observed before the TLBI. Do be careful, 007.
+ * to the SMMU. We are relying on the dma_wmb() implicit during cmd
+ * insertion to guarantee those are observed before the TLBI. Do be
+ * careful, 007.
*/
arm_smmu_cmdq_issue_cmd(smmu, &cmd);
arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
}
-static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
- size_t granule, bool leaf, void *cookie)
+static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
+ size_t granule, bool leaf,
+ struct arm_smmu_domain *smmu_domain)
{
- struct arm_smmu_domain *smmu_domain = cookie;
+ u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
struct arm_smmu_device *smmu = smmu_domain->smmu;
+ unsigned long start = iova, end = iova + size;
+ int i = 0;
struct arm_smmu_cmdq_ent cmd = {
.tlbi = {
.leaf = leaf,
- .addr = iova,
},
};
+ if (!size)
+ return;
+
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode = CMDQ_OP_TLBI_NH_VA;
cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid;
@@ -1597,16 +2007,54 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
}
- do {
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
- cmd.tlbi.addr += granule;
- } while (size -= granule);
+ while (iova < end) {
+ if (i == CMDQ_BATCH_ENTRIES) {
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
+ i = 0;
+ }
+
+ cmd.tlbi.addr = iova;
+ arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
+ iova += granule;
+ i++;
+ }
+
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true);
+
+ /*
+ * Unfortunately, this can't be leaf-only since we may have
+ * zapped an entire table.
+ */
+ arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
+}
+
+static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ struct arm_smmu_domain *smmu_domain = cookie;
+ struct iommu_domain *domain = &smmu_domain->domain;
+
+ iommu_iotlb_gather_add_page(domain, gather, iova, granule);
+}
+
+static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
+}
+
+static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
}
-static const struct iommu_gather_ops arm_smmu_gather_ops = {
+static const struct iommu_flush_ops arm_smmu_flush_ops = {
.tlb_flush_all = arm_smmu_tlb_inv_context,
- .tlb_add_flush = arm_smmu_tlb_inv_range_nosync,
- .tlb_sync = arm_smmu_tlb_sync,
+ .tlb_flush_walk = arm_smmu_tlb_inv_walk,
+ .tlb_flush_leaf = arm_smmu_tlb_inv_leaf,
+ .tlb_add_page = arm_smmu_tlb_inv_page_nosync,
};
/* IOMMU API */
@@ -1796,7 +2244,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
.ias = ias,
.oas = oas,
.coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY,
- .tlb = &arm_smmu_gather_ops,
+ .tlb = &arm_smmu_flush_ops,
.iommu_dev = smmu->dev,
};
@@ -1863,44 +2311,58 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master)
}
}
-static int arm_smmu_enable_ats(struct arm_smmu_master *master)
+static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
{
- int ret;
- size_t stu;
struct pci_dev *pdev;
struct arm_smmu_device *smmu = master->smmu;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev);
if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) ||
!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled())
- return -ENXIO;
+ return false;
pdev = to_pci_dev(master->dev);
- if (pdev->untrusted)
- return -EPERM;
+ return !pdev->untrusted && pdev->ats_cap;
+}
+
+static void arm_smmu_enable_ats(struct arm_smmu_master *master)
+{
+ size_t stu;
+ struct pci_dev *pdev;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_domain *smmu_domain = master->domain;
+
+ /* Don't enable ATS at the endpoint if it's not enabled in the STE */
+ if (!master->ats_enabled)
+ return;
/* Smallest Translation Unit: log2 of the smallest supported granule */
stu = __ffs(smmu->pgsize_bitmap);
+ pdev = to_pci_dev(master->dev);
- ret = pci_enable_ats(pdev, stu);
- if (ret)
- return ret;
-
- master->ats_enabled = true;
- return 0;
+ atomic_inc(&smmu_domain->nr_ats_masters);
+ arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
+ if (pci_enable_ats(pdev, stu))
+ dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
}
static void arm_smmu_disable_ats(struct arm_smmu_master *master)
{
struct arm_smmu_cmdq_ent cmd;
+ struct arm_smmu_domain *smmu_domain = master->domain;
- if (!master->ats_enabled || !dev_is_pci(master->dev))
+ if (!master->ats_enabled)
return;
+ pci_disable_ats(to_pci_dev(master->dev));
+ /*
+ * Ensure ATS is disabled at the endpoint before we issue the
+ * ATC invalidation via the SMMU.
+ */
+ wmb();
arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
arm_smmu_atc_inv_master(master, &cmd);
- pci_disable_ats(to_pci_dev(master->dev));
- master->ats_enabled = false;
+ atomic_dec(&smmu_domain->nr_ats_masters);
}
static void arm_smmu_detach_dev(struct arm_smmu_master *master)
@@ -1911,14 +2373,15 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master)
if (!smmu_domain)
return;
+ arm_smmu_disable_ats(master);
+
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
list_del(&master->domain_head);
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
master->domain = NULL;
+ master->ats_enabled = false;
arm_smmu_install_ste_for_dev(master);
-
- arm_smmu_disable_ats(master);
}
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
@@ -1958,17 +2421,20 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
master->domain = smmu_domain;
- spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_add(&master->domain_head, &smmu_domain->devices);
- spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
-
if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
- arm_smmu_enable_ats(master);
+ master->ats_enabled = arm_smmu_ats_supported(master);
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg);
arm_smmu_install_ste_for_dev(master);
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_add(&master->domain_head, &smmu_domain->devices);
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+ arm_smmu_enable_ats(master);
+
out_unlock:
mutex_unlock(&smmu_domain->init_mutex);
return ret;
@@ -1985,21 +2451,16 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
return ops->map(ops, iova, paddr, size, prot);
}
-static size_t
-arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
+static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size, struct iommu_iotlb_gather *gather)
{
- int ret;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
if (!ops)
return 0;
- ret = ops->unmap(ops, iova, size);
- if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size))
- return 0;
-
- return ret;
+ return ops->unmap(ops, iova, size, gather);
}
static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
@@ -2010,12 +2471,13 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
arm_smmu_tlb_inv_context(smmu_domain);
}
-static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- if (smmu)
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
+ gather->pgsize, true, smmu_domain);
}
static phys_addr_t
@@ -2286,13 +2748,13 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
size_t qsz;
do {
- qsz = ((1 << q->max_n_shift) * dwords) << 3;
+ qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
GFP_KERNEL);
if (q->base || qsz < PAGE_SIZE)
break;
- q->max_n_shift--;
+ q->llq.max_n_shift--;
} while (1);
if (!q->base) {
@@ -2304,7 +2766,7 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
if (!WARN_ON(q->base_dma & (qsz - 1))) {
dev_info(smmu->dev, "allocated %u entries for %s\n",
- 1 << q->max_n_shift, name);
+ 1 << q->llq.max_n_shift, name);
}
q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu);
@@ -2313,24 +2775,55 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
q->q_base = Q_BASE_RWA;
q->q_base |= q->base_dma & Q_BASE_ADDR_MASK;
- q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift);
+ q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
- q->prod = q->cons = 0;
+ q->llq.prod = q->llq.cons = 0;
return 0;
}
+static void arm_smmu_cmdq_free_bitmap(void *data)
+{
+ unsigned long *bitmap = data;
+ bitmap_free(bitmap);
+}
+
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+{
+ int ret = 0;
+ struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+ unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
+ atomic_long_t *bitmap;
+
+ atomic_set(&cmdq->owner_prod, 0);
+ atomic_set(&cmdq->lock, 0);
+
+ bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
+ if (!bitmap) {
+ dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
+ ret = -ENOMEM;
+ } else {
+ cmdq->valid_map = bitmap;
+ devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
+ }
+
+ return ret;
+}
+
static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
{
int ret;
/* cmdq */
- spin_lock_init(&smmu->cmdq.lock);
ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
"cmdq");
if (ret)
return ret;
+ ret = arm_smmu_cmdq_init(smmu);
+ if (ret)
+ return ret;
+
/* evtq */
ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
@@ -2708,8 +3201,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
/* Command queue */
writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
- writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
- writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
+ writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
+ writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
enables = CR0_CMDQEN;
ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
@@ -2736,9 +3229,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
/* Event queue */
writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
- writel_relaxed(smmu->evtq.q.prod,
+ writel_relaxed(smmu->evtq.q.llq.prod,
arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu));
- writel_relaxed(smmu->evtq.q.cons,
+ writel_relaxed(smmu->evtq.q.llq.cons,
arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu));
enables |= CR0_EVTQEN;
@@ -2753,9 +3246,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
if (smmu->features & ARM_SMMU_FEAT_PRI) {
writeq_relaxed(smmu->priq.q.q_base,
smmu->base + ARM_SMMU_PRIQ_BASE);
- writel_relaxed(smmu->priq.q.prod,
+ writel_relaxed(smmu->priq.q.llq.prod,
arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu));
- writel_relaxed(smmu->priq.q.cons,
+ writel_relaxed(smmu->priq.q.llq.cons,
arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu));
enables |= CR0_PRIQEN;
@@ -2909,18 +3402,24 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
}
/* Queue sizes, capped to ensure natural alignment */
- smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
- FIELD_GET(IDR1_CMDQS, reg));
- if (!smmu->cmdq.q.max_n_shift) {
- /* Odd alignment restrictions on the base, so ignore for now */
- dev_err(smmu->dev, "unit-length command queue not supported\n");
+ smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
+ FIELD_GET(IDR1_CMDQS, reg));
+ if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
+ /*
+ * We don't support splitting up batches, so one batch of
+ * commands plus an extra sync needs to fit inside the command
+ * queue. There's also no way we can handle the weird alignment
+ * restrictions on the base pointer for a unit-length queue.
+ */
+ dev_err(smmu->dev, "command queue size <= %d entries not supported\n",
+ CMDQ_BATCH_ENTRIES);
return -ENXIO;
}
- smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
- FIELD_GET(IDR1_EVTQS, reg));
- smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
- FIELD_GET(IDR1_PRIQS, reg));
+ smmu->evtq.q.llq.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT,
+ FIELD_GET(IDR1_EVTQS, reg));
+ smmu->priq.q.llq.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT,
+ FIELD_GET(IDR1_PRIQS, reg));
/* SID/SSID sizes */
smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg);