diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-15 00:31:23 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-15 00:31:23 +0100 |
commit | 1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa (patch) | |
tree | a391eed8ae206613b48e02e56e6ad5c4432d8767 /tools/perf/builtin-report.c | |
parent | Merge tag 'trace-ring-buffer-v6.8-rc7-2' of git://git.kernel.org/pub/scm/linu... (diff) | |
parent | perf annotate: Add comments in the data structures (diff) | |
download | linux-1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa.tar.xz linux-1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa.zip |
Merge tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools
Pull perf tools updates from Namhyung Kim:
"perf stat:
- Support new 'cluster' aggregation mode for shared resources
depending on the hardware configuration:
$ sudo perf stat -a --per-cluster -e cycles,instructions sleep 1
Performance counter stats for 'system wide':
S0-D0-CLS0 2 85,051,822 cycles
S0-D0-CLS0 2 73,909,908 instructions # 0.87 insn per cycle
S0-D0-CLS2 2 93,365,918 cycles
S0-D0-CLS2 2 83,006,158 instructions # 0.89 insn per cycle
S0-D0-CLS4 2 104,157,523 cycles
S0-D0-CLS4 2 53,234,396 instructions # 0.51 insn per cycle
S0-D0-CLS6 2 65,891,079 cycles
S0-D0-CLS6 2 41,478,273 instructions # 0.63 insn per cycle
1.002407989 seconds time elapsed
- Various fixes and cleanups for event metrics including NaN handling
perf script:
- Use libcapstone if available to disassemble the instructions. This
enables 'perf script -F disasm' and 'perf script --insn-trace=disasm'
(for Intel-PT):
$ perf script -F event,ip,disasm
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa9839d25 movq %rax, %r14
cycles:P: ffffffffa9cdcaf0 endbr64
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa99c4de5 movq 0x30(%rcx), %r8
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa9907983 movl 0x68(%rbx), %eax
cycles:P: ffffffffa988d428 wrmsr
- Expose sample ID / stream ID to python scripts
perf test:
- Add more perf test cases from Red Hat internal test suites. This
time it adds the base infra and a few perf probe tests. More to
come. :)
- Add 'perf test -p' for parallel execution and fix some issues found
by the parallel test
- Support symbol test to print symbols in a given (active) module:
$ perf test -F -v Symbols --dso /lib/modules/$(uname -r)/kernel/fs/ext4/ext4.ko
--- start ---
Testing /lib/modules/6.5.13-1rodete2-amd64/kernel/fs/ext4/ext4.ko
Overlapping symbols:
7a990-7a9a0 l __pfx_ext4_exit_fs
7a990-7a9a0 g __pfx_cleanup_module
Overlapping symbols:
7a9a0-7aa1c l ext4_exit_fs
7a9a0-7aa1c g cleanup_module
...
JSON metric updates:
- A new round of Intel metric updates
- Support Power11 PVR (compatible with Power10)
- Fix cache latency events on Zen 4 to set SliceId properly
Internal:
- Fix reference counting for 'map' data structure, tireless work from
Ian!
- More memory optimization for struct thread and annotate histogram.
Now, 'perf report' (TUI) and 'perf annotate' should be much
lighter-weight in terms of memory footprint
- Support cross-arch perf register access. Clean up the build
configuration so that it can detect arch-register support at
runtime. This allows parsing register data in samples that were
recorded on a different arch
Others:
- Sync task state in 'perf sched' to kernel using trace event fields.
The task states have been changed so tools cannot assume a fixed
encoding
- Clean up 'perf mem' to generalize the arch-specific events
- Add support for local and global variables to data type profiling.
This would increase the success rate of type resolution with DWARF
- Add short option -H for --hierarchy in 'perf report' and 'perf top'"
* tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (154 commits)
perf annotate: Add comments in the data structures
perf annotate: Remove sym_hist.addr[] array
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Add a hashmap for symbol histogram
perf threads: Reduce table size from 256 to 8
perf threads: Switch from rbtree to hashmap
perf threads: Move threads to its own files
perf machine: Move machine's threads into its own abstraction
perf machine: Move fprintf to for_each loop and a callback
perf trace: Ignore thread hashing in summary
perf report: Sort child tasks by tid
perf vendor events amd: Fix Zen 4 cache latency events
perf version: Display availability of OpenCSD support
perf vendor events intel: Add umasks/occ_sel to PCU events.
perf map: Fix map reference count issues
libperf evlist: Avoid out-of-bounds access
perf lock contention: Account contending locks too
perf metrics: Fix segv for metrics with no events
perf metrics: Fix metric matching
perf pmu: Fix a potential memory leak in perf_pmu__lookup()
...
Diffstat (limited to 'tools/perf/builtin-report.c')
-rw-r--r-- | tools/perf/builtin-report.c | 221 |
1 files changed, 131 insertions, 90 deletions
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f2ed2b7e80a3..dcd93ee5fc24 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -59,6 +59,7 @@ #include <linux/ctype.h> #include <signal.h> #include <linux/bitmap.h> +#include <linux/list_sort.h> #include <linux/string.h> #include <linux/stringify.h> #include <linux/time64.h> @@ -828,35 +829,6 @@ static void tasks_setup(struct report *rep) rep->tool.no_warn = true; } -struct task { - struct thread *thread; - struct list_head list; - struct list_head children; -}; - -static struct task *tasks_list(struct task *task, struct machine *machine) -{ - struct thread *parent_thread, *thread = task->thread; - struct task *parent_task; - - /* Already listed. */ - if (!list_empty(&task->list)) - return NULL; - - /* Last one in the chain. */ - if (thread__ppid(thread) == -1) - return task; - - parent_thread = machine__find_thread(machine, -1, thread__ppid(thread)); - if (!parent_thread) - return ERR_PTR(-ENOENT); - - parent_task = thread__priv(parent_thread); - thread__put(parent_thread); - list_add_tail(&task->list, &parent_task->children); - return tasks_list(parent_task, machine); -} - struct maps__fprintf_task_args { int indent; FILE *fp; @@ -900,89 +872,156 @@ static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp) return args.printed; } -static void task__print_level(struct task *task, FILE *fp, int level) +static int thread_level(struct machine *machine, const struct thread *thread) { - struct thread *thread = task->thread; - struct task *child; - int comm_indent = fprintf(fp, " %8d %8d %8d |%*s", - thread__pid(thread), thread__tid(thread), - thread__ppid(thread), level, ""); + struct thread *parent_thread; + int res; - fprintf(fp, "%s\n", thread__comm_str(thread)); + if (thread__tid(thread) <= 0) + return 0; - maps__fprintf_task(thread__maps(thread), comm_indent, fp); + if (thread__ppid(thread) <= 0) + return 1; - if (!list_empty(&task->children)) { - 
list_for_each_entry(child, &task->children, list) - task__print_level(child, fp, level + 1); + parent_thread = machine__find_thread(machine, -1, thread__ppid(thread)); + if (!parent_thread) { + pr_err("Missing parent thread of %d\n", thread__tid(thread)); + return 0; } + res = 1 + thread_level(machine, parent_thread); + thread__put(parent_thread); + return res; } -static int tasks_print(struct report *rep, FILE *fp) +static void task__print_level(struct machine *machine, struct thread *thread, FILE *fp) { - struct perf_session *session = rep->session; - struct machine *machine = &session->machines.host; - struct task *tasks, *task; - unsigned int nr = 0, itask = 0, i; - struct rb_node *nd; - LIST_HEAD(list); + int level = thread_level(machine, thread); + int comm_indent = fprintf(fp, " %8d %8d %8d |%*s", + thread__pid(thread), thread__tid(thread), + thread__ppid(thread), level, ""); - /* - * No locking needed while accessing machine->threads, - * because --tasks is single threaded command. - */ + fprintf(fp, "%s\n", thread__comm_str(thread)); - /* Count all the threads. */ - for (i = 0; i < THREADS__TABLE_SIZE; i++) - nr += machine->threads[i].nr; + maps__fprintf_task(thread__maps(thread), comm_indent, fp); +} - tasks = malloc(sizeof(*tasks) * nr); - if (!tasks) - return -ENOMEM; +/* + * Sort two thread list nodes such that they form a tree. The first node is the + * root of the tree, its children are ordered numerically after it. If a child + * has children itself then they appear immediately after their parent. For + * example, the 4 threads in the order they'd appear in the list: + * - init with a TID 1 and a parent of 0 + * - systemd with a TID 3000 and a parent of init/1 + * - systemd child thread with TID 4000, the parent is 3000 + * - NetworkManager is a child of init with a TID of 3500. 
+ */ +static int task_list_cmp(void *priv, const struct list_head *la, const struct list_head *lb) +{ + struct machine *machine = priv; + struct thread_list *task_a = list_entry(la, struct thread_list, list); + struct thread_list *task_b = list_entry(lb, struct thread_list, list); + struct thread *a = task_a->thread; + struct thread *b = task_b->thread; + int level_a, level_b, res; + + /* Same thread? */ + if (thread__tid(a) == thread__tid(b)) + return 0; - for (i = 0; i < THREADS__TABLE_SIZE; i++) { - struct threads *threads = &machine->threads[i]; + /* Compare a and b to root. */ + if (thread__tid(a) == 0) + return -1; - for (nd = rb_first_cached(&threads->entries); nd; - nd = rb_next(nd)) { - task = tasks + itask++; + if (thread__tid(b) == 0) + return 1; - task->thread = rb_entry(nd, struct thread_rb_node, rb_node)->thread; - INIT_LIST_HEAD(&task->children); - INIT_LIST_HEAD(&task->list); - thread__set_priv(task->thread, task); - } - } + /* If parents match sort by tid. */ + if (thread__ppid(a) == thread__ppid(b)) + return thread__tid(a) < thread__tid(b) ? -1 : 1; /* - * Iterate every task down to the unprocessed parent - * and link all in task children list. Task with no - * parent is added into 'list'. + * Find a and b such that if they are a child of each other a and b's + * tid's match, otherwise a and b have a common parent and distinct + * tid's to sort by. First make the depths of the threads match. 
*/ - for (itask = 0; itask < nr; itask++) { - task = tasks + itask; - - if (!list_empty(&task->list)) - continue; - - task = tasks_list(task, machine); - if (IS_ERR(task)) { - pr_err("Error: failed to process tasks\n"); - free(tasks); - return PTR_ERR(task); + level_a = thread_level(machine, a); + level_b = thread_level(machine, b); + a = thread__get(a); + b = thread__get(b); + for (int i = level_a; i > level_b; i--) { + struct thread *parent = machine__find_thread(machine, -1, thread__ppid(a)); + + thread__put(a); + if (!parent) { + pr_err("Missing parent thread of %d\n", thread__tid(a)); + thread__put(b); + return -1; } + a = parent; + } + for (int i = level_b; i > level_a; i--) { + struct thread *parent = machine__find_thread(machine, -1, thread__ppid(b)); - if (task) - list_add_tail(&task->list, &list); + thread__put(b); + if (!parent) { + pr_err("Missing parent thread of %d\n", thread__tid(b)); + thread__put(a); + return 1; + } + b = parent; + } + /* Search up to a common parent. */ + while (thread__ppid(a) != thread__ppid(b)) { + struct thread *parent; + + parent = machine__find_thread(machine, -1, thread__ppid(a)); + thread__put(a); + if (!parent) + pr_err("Missing parent thread of %d\n", thread__tid(a)); + a = parent; + parent = machine__find_thread(machine, -1, thread__ppid(b)); + thread__put(b); + if (!parent) + pr_err("Missing parent thread of %d\n", thread__tid(b)); + b = parent; + if (!a || !b) { + /* Handle missing parent (unexpected) with some sanity. */ + thread__put(a); + thread__put(b); + return !a && !b ? 0 : (!a ? -1 : 1); + } + } + if (thread__tid(a) == thread__tid(b)) { + /* a is a child of b or vice-versa, deeper levels appear later. */ + res = level_a < level_b ? -1 : (level_a > level_b ? 1 : 0); + } else { + /* Sort by tid now the parent is the same. */ + res = thread__tid(a) < thread__tid(b) ? 
-1 : 1; } + thread__put(a); + thread__put(b); + return res; +} - fprintf(fp, "# %8s %8s %8s %s\n", "pid", "tid", "ppid", "comm"); +static int tasks_print(struct report *rep, FILE *fp) +{ + struct machine *machine = &rep->session->machines.host; + LIST_HEAD(tasks); + int ret; - list_for_each_entry(task, &list, list) - task__print_level(task, fp, 0); + ret = machine__thread_list(machine, &tasks); + if (!ret) { + struct thread_list *task; - free(tasks); - return 0; + list_sort(machine, &tasks, task_list_cmp); + + fprintf(fp, "# %8s %8s %8s %s\n", "pid", "tid", "ppid", "comm"); + + list_for_each_entry(task, &tasks, list) + task__print_level(machine, task->thread, fp); + } + thread_list__delete(&tasks); + return ret; } static int __cmd_report(struct report *rep) @@ -1410,7 +1449,7 @@ int cmd_report(int argc, const char **argv) "only show processor socket that match with this filter"), OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace, "Show raw trace event output (do not use print fmt or plugins)"), - OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy, + OPT_BOOLEAN('H', "hierarchy", &symbol_conf.report_hierarchy, "Show entries in a hierarchy"), OPT_CALLBACK_DEFAULT(0, "stdio-color", NULL, "mode", "'always' (default), 'never' or 'auto' only applicable to --stdio mode", @@ -1766,6 +1805,8 @@ repeat: } else ret = 0; + if (!use_browser && (verbose > 2 || debug_kmaps)) + perf_session__dump_kmaps(session); error: if (report.ptime_range) { itrace_synth_opts__clear_time_range(&itrace_synth_opts); |