Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r--  tools/perf/util/arm-spe.c  321
1 file changed, 280 insertions(+), 41 deletions(-)
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 138ffc71b32d..dbf13f47879c 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -46,7 +46,6 @@ struct arm_spe {
struct perf_session *session;
struct machine *machine;
u32 pmu_type;
- u64 midr;
struct perf_tsc_conversion tc;
@@ -69,7 +68,7 @@ struct arm_spe {
u64 llc_access_id;
u64 tlb_miss_id;
u64 tlb_access_id;
- u64 branch_miss_id;
+ u64 branch_id;
u64 remote_access_id;
u64 memory_id;
u64 instructions_id;
@@ -78,6 +77,11 @@ struct arm_spe {
unsigned long num_events;
u8 use_ctx_pkt_for_pid;
+
+ u64 **metadata;
+ u64 metadata_ver;
+ u64 metadata_nr_cpu;
+ bool is_homogeneous;
};
struct arm_spe_queue {
@@ -96,6 +100,7 @@ struct arm_spe_queue {
u64 timestamp;
struct thread *thread;
u64 period_instructions;
+ u32 flags;
};
static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
@@ -118,7 +123,7 @@ static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
else
pkt_len = 1;
printf(".");
- color_fprintf(stdout, color, " %08x: ", pos);
+ color_fprintf(stdout, color, " %08zx: ", pos);
for (i = 0; i < pkt_len; i++)
color_fprintf(stdout, color, " %02x", buf[i]);
for (; i < 16; i++)
@@ -273,6 +278,20 @@ static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
return 0;
}
+static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
+{
+ u64 i;
+
+ if (!spe->metadata)
+ return NULL;
+
+ for (i = 0; i < spe->metadata_nr_cpu; i++)
+ if (spe->metadata[i][ARM_SPE_CPU] == cpu)
+ return spe->metadata[i];
+
+ return NULL;
+}
+
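The new helper above does a linear scan keyed on the logical CPU stored in each per-CPU metadata block. A minimal, hypothetical caller (names taken from this patch; the wrapper itself is illustrative only):

/* Hypothetical usage sketch: resolve the MIDR for a given CPU,
 * falling back to 0 when no per-CPU metadata entry matches.
 */
static u64 example_midr_for_cpu(struct arm_spe *spe, u64 cpu)
{
        u64 *metadata = arm_spe__get_metadata_by_cpu(spe, cpu);

        return metadata ? metadata[ARM_SPE_CPU_MIDR] : 0;
}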
static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
{
struct simd_flags simd_flags = {};
@@ -376,6 +395,7 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
sample.stream_id = spe_events_id;
sample.addr = record->to_ip;
sample.weight = record->latency;
+ sample.flags = speq->flags;
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
}
@@ -400,24 +420,44 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
sample.id = spe_events_id;
sample.stream_id = spe_events_id;
- sample.addr = record->virt_addr;
+ sample.addr = record->to_ip;
sample.phys_addr = record->phys_addr;
sample.data_src = data_src;
sample.period = spe->instructions_sample_period;
sample.weight = record->latency;
+ sample.flags = speq->flags;
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
}
-static const struct midr_range neoverse_spe[] = {
+static const struct midr_range common_ds_encoding_cpus[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
{},
};
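The table above generalizes the old Neoverse-only list: every core known to use the common data-source packet encoding is enumerated, and MIDR_ALL_VERSIONS matches all variant and revision fields of a part, so new silicon revisions of a listed core stay covered. A hedged sketch of how such a list is consulted (is_midr_in_range_list is the arm64 cputype helper used later in this patch):

/* Illustrative probe, mirroring the check done in
 * arm_spe__is_common_ds_encoding() below.
 */
static bool example_uses_common_ds_encoding(u64 midr)
{
        return is_midr_in_range_list(midr, common_ds_encoding_cpus);
}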
-static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
- union perf_mem_data_src *data_src)
+static void arm_spe__sample_flags(struct arm_spe_queue *speq)
+{
+ const struct arm_spe_record *record = &speq->decoder->record;
+
+ speq->flags = 0;
+ if (record->op & ARM_SPE_OP_BRANCH_ERET) {
+ speq->flags = PERF_IP_FLAG_BRANCH;
+
+ if (record->type & ARM_SPE_BRANCH_MISS)
+ speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
+ }
+}
+
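arm_spe__sample_flags() runs once per record (see arm_spe_sample() below), so branch and instruction samples carry identical flags. The mapping, restated as a standalone sketch with assumed inputs:

/* Only branch/eret operations produce flags; a miss adds
 * PERF_IP_FLAG_BRANCH_MISS on top of PERF_IP_FLAG_BRANCH.
 */
static u32 example_flags_for(u64 op, u64 type)
{
        u32 flags = 0;

        if (op & ARM_SPE_OP_BRANCH_ERET) {
                flags = PERF_IP_FLAG_BRANCH;
                if (type & ARM_SPE_BRANCH_MISS)
                        flags |= PERF_IP_FLAG_BRANCH_MISS;
        }
        return flags;
}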
+static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
{
/*
* Even though four levels of cache hierarchy are possible, no known
@@ -439,17 +479,17 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec
}
switch (record->source) {
- case ARM_SPE_NV_L1D:
+ case ARM_SPE_COMMON_DS_L1D:
data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
break;
- case ARM_SPE_NV_L2:
+ case ARM_SPE_COMMON_DS_L2:
data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
break;
- case ARM_SPE_NV_PEER_CORE:
+ case ARM_SPE_COMMON_DS_PEER_CORE:
data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
@@ -458,8 +498,8 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec
* We don't know if this is L1, L2 but we do know it was a cache-2-cache
* transfer, so set SNOOPX_PEER
*/
- case ARM_SPE_NV_LOCAL_CLUSTER:
- case ARM_SPE_NV_PEER_CLUSTER:
+ case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
+ case ARM_SPE_COMMON_DS_PEER_CLUSTER:
data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
@@ -467,7 +507,7 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec
/*
* System cache is assumed to be L3
*/
- case ARM_SPE_NV_SYS_CACHE:
+ case ARM_SPE_COMMON_DS_SYS_CACHE:
data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
@@ -476,13 +516,13 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec
* We don't know what level it hit in, except it came from the other
* socket
*/
- case ARM_SPE_NV_REMOTE:
+ case ARM_SPE_COMMON_DS_REMOTE:
data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
break;
- case ARM_SPE_NV_DRAM:
+ case ARM_SPE_COMMON_DS_DRAM:
data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
@@ -492,8 +532,8 @@ static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *rec
}
}
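A worked example of the switch above: under the common encoding, a load whose source decodes to ARM_SPE_COMMON_DS_PEER_CORE is reported as an L2 hit with peer snooping, i.e. a cache-to-cache transfer from another core (a sketch, values shown for illustration only):

static union perf_mem_data_src example_peer_core_src(void)
{
        union perf_mem_data_src src = { .mem_op = PERF_MEM_OP_LOAD };

        src.mem_lvl     = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
        src.mem_lvl_num = PERF_MEM_LVLNUM_L2;
        src.mem_snoopx  = PERF_MEM_SNOOPX_PEER; /* cache-to-cache transfer */
        return src;
}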
-static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
- union perf_mem_data_src *data_src)
+static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
{
if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
data_src->mem_lvl = PERF_MEM_LVL_L3;
@@ -515,10 +555,55 @@ static void arm_spe__synth_data_source_generic(const struct arm_spe_record *reco
data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
}
-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
+static bool arm_spe__is_common_ds_encoding(struct arm_spe_queue *speq)
+{
+ struct arm_spe *spe = speq->spe;
+ u64 *metadata = NULL;
+ u64 midr = 0;
+
+ /* Metadata version 1 assumes all CPUs are the same (old behavior) */
+ if (spe->metadata_ver == 1) {
+ const char *cpuid;
+
+ pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
+ cpuid = perf_env__cpuid(spe->session->evlist->env);
+ midr = strtol(cpuid, NULL, 16);
+ } else {
+ /* CPU ID is -1 for per-thread mode */
+ if (speq->cpu < 0) {
+ /*
+ * On a heterogeneous system, a CPU ID of -1 means we
+ * cannot confirm whether the data source packet is supported.
+ */
+ if (!spe->is_homogeneous)
+ return false;
+
+ /* On a homogeneous system, simply use CPU0's metadata */
+ if (spe->metadata)
+ metadata = spe->metadata[0];
+ } else {
+ metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
+ }
+
+ if (!metadata)
+ return false;
+
+ midr = metadata[ARM_SPE_CPU_MIDR];
+ }
+
+ return is_midr_in_range_list(midr, common_ds_encoding_cpus);
+}
+
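For version 1 metadata, the MIDR is recovered from the session's cpuid string, which on arm64 is the MIDR printed as hex text (format assumed here). A minimal sketch of that fallback path:

#include <stdlib.h>

/* Hypothetical helper matching the v1 fallback above: parse a
 * MIDR out of a cpuid string such as "0x410fd490" (assumed format;
 * base 16 accepts input with or without the "0x" prefix).
 */
static unsigned long example_midr_from_cpuid(const char *cpuid)
{
        return strtoul(cpuid, NULL, 16);
}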
+static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
+ const struct arm_spe_record *record)
{
union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA };
- bool is_neoverse = is_midr_in_range_list(midr, neoverse_spe);
+ bool is_common = arm_spe__is_common_ds_encoding(speq);
if (record->op & ARM_SPE_OP_LD)
data_src.mem_op = PERF_MEM_OP_LOAD;
@@ -527,10 +612,10 @@ static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 m
else
return 0;
- if (is_neoverse)
- arm_spe__synth_data_source_neoverse(record, &data_src);
+ if (is_common)
+ arm_spe__synth_data_source_common(record, &data_src);
else
- arm_spe__synth_data_source_generic(record, &data_src);
+ arm_spe__synth_memory_level(record, &data_src);
if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -551,7 +636,8 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
u64 data_src;
int err;
- data_src = arm_spe__synth_data_source(record, spe->midr);
+ arm_spe__sample_flags(speq);
+ data_src = arm_spe__synth_data_source(speq, record);
if (spe->sample_flc) {
if (record->type & ARM_SPE_L1D_MISS) {
@@ -601,8 +687,8 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
}
}
- if (spe->sample_branch && (record->type & ARM_SPE_BRANCH_MISS)) {
- err = arm_spe__synth_branch_sample(speq, spe->branch_miss_id);
+ if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
+ err = arm_spe__synth_branch_sample(speq, spe->branch_id);
if (err)
return err;
}
@@ -1016,6 +1102,73 @@ static int arm_spe_flush(struct perf_session *session __maybe_unused,
return 0;
}
+static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
+{
+ u64 *metadata;
+
+ metadata = zalloc(per_cpu_size);
+ if (!metadata)
+ return NULL;
+
+ memcpy(metadata, buf, per_cpu_size);
+ return metadata;
+}
+
+static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
+{
+ int i;
+
+ for (i = 0; i < nr_cpu; i++)
+ zfree(&metadata[i]);
+ free(metadata);
+}
+
+static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
+ u64 *ver, int *nr_cpu)
+{
+ u64 *ptr = (u64 *)info->priv;
+ u64 metadata_size;
+ u64 **metadata = NULL;
+ int hdr_sz, per_cpu_sz, i;
+
+ metadata_size = info->header.size -
+ sizeof(struct perf_record_auxtrace_info);
+
+ /* Metadata version 1 */
+ if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
+ *ver = 1;
+ *nr_cpu = 0;
+ /* No per CPU metadata */
+ return NULL;
+ }
+
+ *ver = ptr[ARM_SPE_HEADER_VERSION];
+ hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
+ *nr_cpu = ptr[ARM_SPE_CPUS_NUM];
+
+ metadata = calloc(*nr_cpu, sizeof(*metadata));
+ if (!metadata)
+ return NULL;
+
+ /* Locate the start address of per CPU metadata */
+ ptr += hdr_sz;
+ per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
+
+ for (i = 0; i < *nr_cpu; i++) {
+ metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
+ if (!metadata[i])
+ goto err_per_cpu_metadata;
+
+ ptr += per_cpu_sz / sizeof(u64);
+ }
+
+ return metadata;
+
+err_per_cpu_metadata:
+ arm_spe__free_metadata(metadata, *nr_cpu);
+ return NULL;
+}
+
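The parser above treats the auxtrace_info priv area as a flat u64 array: a fixed header followed by equal-sized per-CPU blocks. The sizing arithmetic, as a self-contained sketch (it mirrors per_cpu_sz above; priv_bytes and hdr_words are assumed names):

#include <stdint.h>

/* Bytes in each per-CPU block: whatever remains after the header,
 * split evenly across the recorded CPUs.
 */
static inline uint64_t example_per_cpu_bytes(uint64_t priv_bytes,
                                             uint64_t hdr_words,
                                             uint64_t nr_cpu)
{
        return (priv_bytes - hdr_words * sizeof(uint64_t)) / nr_cpu;
}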
static void arm_spe_free_queue(void *priv)
{
struct arm_spe_queue *speq = priv;
@@ -1050,6 +1203,7 @@ static void arm_spe_free(struct perf_session *session)
auxtrace_heap__free(&spe->heap);
arm_spe_free_events(session);
session->auxtrace = NULL;
+ arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
free(spe);
}
@@ -1061,16 +1215,60 @@ static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
return evsel->core.attr.type == spe->pmu_type;
}
-static const char * const arm_spe_info_fmts[] = {
- [ARM_SPE_PMU_TYPE] = " PMU Type %"PRId64"\n",
+static const char * const metadata_hdr_v1_fmts[] = {
+ [ARM_SPE_PMU_TYPE] = " PMU Type :%"PRId64"\n",
+ [ARM_SPE_PER_CPU_MMAPS] = " Per CPU mmaps :%"PRId64"\n",
+};
+
+static const char * const metadata_hdr_fmts[] = {
+ [ARM_SPE_HEADER_VERSION] = " Header version :%"PRId64"\n",
+ [ARM_SPE_HEADER_SIZE] = " Header size :%"PRId64"\n",
+ [ARM_SPE_PMU_TYPE_V2] = " PMU type v2 :%"PRId64"\n",
+ [ARM_SPE_CPUS_NUM] = " CPU number :%"PRId64"\n",
+};
+
+static const char * const metadata_per_cpu_fmts[] = {
+ [ARM_SPE_MAGIC] = " Magic :0x%"PRIx64"\n",
+ [ARM_SPE_CPU] = " CPU # :%"PRId64"\n",
+ [ARM_SPE_CPU_NR_PARAMS] = " Num of params :%"PRId64"\n",
+ [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n",
+ [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n",
+ [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n",
};
-static void arm_spe_print_info(__u64 *arr)
+static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
{
+ unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
+ const char * const *hdr_fmts;
+
if (!dump_trace)
return;
- fprintf(stdout, arm_spe_info_fmts[ARM_SPE_PMU_TYPE], arr[ARM_SPE_PMU_TYPE]);
+ if (spe->metadata_ver == 1) {
+ cpu_num = 0;
+ hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
+ hdr_fmts = metadata_hdr_v1_fmts;
+ } else {
+ cpu_num = arr[ARM_SPE_CPUS_NUM];
+ hdr_size = arr[ARM_SPE_HEADER_SIZE];
+ hdr_fmts = metadata_hdr_fmts;
+ }
+
+ for (i = 0; i < hdr_size; i++)
+ fprintf(stdout, hdr_fmts[i], arr[i]);
+
+ arr += hdr_size;
+ for (cpu = 0; cpu < cpu_num; cpu++) {
+ /*
+ * The fields from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS are
+ * fixed; the number of parameters that follow is given by the
+ * field 'ARM_SPE_CPU_NR_PARAMS'.
+ */
+ cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
+ for (i = 0; i < cpu_size; i++)
+ fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
+ arr += cpu_size;
+ }
}
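The per-CPU block length in the loop above is self-describing: three fixed words (MAGIC, CPU, NR_PARAMS) plus however many parameters NR_PARAMS announces. Assuming the enum order implied by the format table above, a block announcing 3 parameters gives cpu_size = (2 + 1) + 3 = 6 words: magic, cpu, nr_params, midr, pmu_type, min_ival. As a sketch:

/* Hypothetical helper restating the sizing rule used above. */
static inline unsigned int example_cpu_block_words(const __u64 *arr)
{
        return (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
}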
static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
@@ -1202,12 +1400,12 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
if (spe->synth_opts.branches) {
spe->sample_branch = true;
- /* Branch miss */
+ /* Branch */
err = perf_session__deliver_synth_attr_event(session, &attr, id);
if (err)
return err;
- spe->branch_miss_id = id;
- arm_spe_set_event_name(evlist, id, "branch-miss");
+ spe->branch_id = id;
+ arm_spe_set_event_name(evlist, id, "branch");
id += 1;
}
@@ -1258,24 +1456,57 @@ synth_instructions_out:
return 0;
}
+static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
+{
+ u64 midr;
+ int i;
+
+ if (!nr_cpu)
+ return false;
+
+ for (i = 0; i < nr_cpu; i++) {
+ if (!metadata[i])
+ return false;
+
+ if (i == 0) {
+ midr = metadata[i][ARM_SPE_CPU_MIDR];
+ continue;
+ }
+
+ if (midr != metadata[i][ARM_SPE_CPU_MIDR])
+ return false;
+ }
+
+ return true;
+}
+
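arm_spe__is_homogeneous() compares every CPU's MIDR against CPU0's. A hypothetical self-test (array sizing and MIDR values assumed for illustration):

#include <assert.h>

static void example_homogeneity(void)
{
        u64 cpu0[8] = { 0 }, cpu1[8] = { 0 }; /* large enough for ARM_SPE_CPU_MIDR */
        u64 *metadata[2] = { cpu0, cpu1 };

        cpu0[ARM_SPE_CPU_MIDR] = 0x410fd490; /* assumed MIDR value */
        cpu1[ARM_SPE_CPU_MIDR] = 0x410fd490;
        assert(arm_spe__is_homogeneous(metadata, 2));

        cpu1[ARM_SPE_CPU_MIDR] = 0x410fd0d0; /* a different part */
        assert(!arm_spe__is_homogeneous(metadata, 2));
}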
int arm_spe_process_auxtrace_info(union perf_event *event,
struct perf_session *session)
{
struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
- size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
+ size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
struct perf_record_time_conv *tc = &session->time_conv;
- const char *cpuid = perf_env__cpuid(session->evlist->env);
- u64 midr = strtol(cpuid, NULL, 16);
struct arm_spe *spe;
- int err;
+ u64 **metadata = NULL;
+ u64 metadata_ver;
+ int nr_cpu, err;
if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
min_sz)
return -EINVAL;
+ metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
+ &nr_cpu);
+ if (!metadata && metadata_ver != 1) {
+ pr_err("Failed to parse Arm SPE metadata.\n");
+ return -EINVAL;
+ }
+
spe = zalloc(sizeof(struct arm_spe));
- if (!spe)
- return -ENOMEM;
+ if (!spe) {
+ err = -ENOMEM;
+ goto err_free_metadata;
+ }
err = auxtrace_queues__init(&spe->queues);
if (err)
@@ -1284,8 +1515,14 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
spe->session = session;
spe->machine = &session->machines.host; /* No kvm support */
spe->auxtrace_type = auxtrace_info->type;
- spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
- spe->midr = midr;
+ if (metadata_ver == 1)
+ spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+ else
+ spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
+ spe->metadata = metadata;
+ spe->metadata_ver = metadata_ver;
+ spe->metadata_nr_cpu = nr_cpu;
+ spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
@@ -1318,7 +1555,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
session->auxtrace = &spe->auxtrace;
- arm_spe_print_info(&auxtrace_info->priv[0]);
+ arm_spe_print_info(spe, &auxtrace_info->priv[0]);
if (dump_trace)
return 0;
@@ -1346,5 +1583,7 @@ err_free_queues:
session->auxtrace = NULL;
err_free:
free(spe);
+err_free_metadata:
+ arm_spe__free_metadata(metadata, nr_cpu);
return err;
}