From ab38e84ba9a80581e055408e0f8c0158998fa4b9 Mon Sep 17 00:00:00 2001 From: Blake Jones Date: Thu, 12 Jun 2025 12:49:36 -0700 Subject: perf record: collect BPF metadata from existing BPF programs Look for .rodata maps, find ones with 'bpf_metadata_' variables, extract their values as strings, and create a new PERF_RECORD_BPF_METADATA synthetic event using that data. The code gets invoked from the existing routine perf_event__synthesize_one_bpf_prog(). For example, a BPF program with the following variables: const char bpf_metadata_version[] SEC(".rodata") = "3.14159"; int bpf_metadata_value[] SEC(".rodata") = 42; would generate a PERF_RECORD_BPF_METADATA record with: .prog_name = .nr_entries = 2 .entries[0].key = "version" .entries[0].value = "3.14159" .entries[1].key = "value" .entries[1].value = "42" Each of the BPF programs and subprograms that share those variables would get a distinct PERF_RECORD_BPF_METADATA record, with the ".prog_name" showing the name of each program or subprogram. The prog_name is deliberately the same as the ".name" field in the corresponding PERF_RECORD_KSYMBOL record. This code only gets invoked if support for displaying BTF char arrays as strings is detected. Signed-off-by: Blake Jones Link: https://lore.kernel.org/r/20250612194939.162730-3-blakejones@google.com Signed-off-by: Namhyung Kim --- tools/lib/perf/include/perf/event.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools/lib') diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 09b7c643ddac..6608f1e3701b 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -467,6 +467,22 @@ struct perf_record_compressed2 { char data[]; }; +#define BPF_METADATA_KEY_LEN 64 +#define BPF_METADATA_VALUE_LEN 256 +#define BPF_PROG_NAME_LEN KSYM_NAME_LEN + +struct perf_record_bpf_metadata_entry { + char key[BPF_METADATA_KEY_LEN]; + char value[BPF_METADATA_VALUE_LEN]; +}; + +struct perf_record_bpf_metadata { + struct perf_event_header header; + char prog_name[BPF_PROG_NAME_LEN]; + __u64 nr_entries; + struct perf_record_bpf_metadata_entry entries[]; +}; + enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_USER_TYPE_START = 64, PERF_RECORD_HEADER_ATTR = 64, @@ -489,6 +505,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_COMPRESSED = 81, PERF_RECORD_FINISHED_INIT = 82, PERF_RECORD_COMPRESSED2 = 83, + PERF_RECORD_BPF_METADATA = 84, PERF_RECORD_HEADER_MAX }; @@ -530,6 +547,7 @@ union perf_event { struct perf_record_header_feature feat; struct perf_record_compressed pack; struct perf_record_compressed2 pack2; + struct perf_record_bpf_metadata bpf_metadata; }; #endif /* __LIBPERF_EVENT_H */ -- cgit v1.2.3 From be59dba332e1e8edd3e88d991ba0e4795ae2bcb2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 17 Jun 2025 15:33:56 -0700 Subject: libperf evsel: Add missed puts and asserts A missed evsel__close before evsel__delete was the source of leaking perf events due to a hybrid test. Add asserts in debug builds so that this shouldn't happen in the future. Add puts missing on the cpu map and thread maps. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20250617223356.2752099-4-irogers@google.com Signed-off-by: Namhyung Kim --- tools/lib/perf/evsel.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools/lib') diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index c475319e2e41..2a85e0bfee1e 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -42,6 +42,12 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr) void perf_evsel__delete(struct perf_evsel *evsel) { + assert(evsel->fd == NULL); /* If not fds were not closed. */ + assert(evsel->mmap == NULL); /* If not munmap wasn't called. */ + assert(evsel->sample_id == NULL); /* If not free_id wasn't called. */ + perf_cpu_map__put(evsel->cpus); + perf_cpu_map__put(evsel->own_cpus); + perf_thread_map__put(evsel->threads); free(evsel); } -- cgit v1.2.3 From 1fdf938168c4d26fa279d4f204768690d1f9c4ae Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 Jul 2025 13:10:27 -0700 Subject: perf tools: Fix use-after-free in help_unknown_cmd() Currently perf aborts when it finds an invalid command. I guess it depends on the environment as I have some custom commands in the path. $ perf bad-command perf: 'bad-command' is not a perf-command. See 'perf --help'. Aborted (core dumped) It's because the exclude_cmds() in libsubcmd has a use-after-free when it removes some entries. After copying one to another entry, it keeps the pointer in the both position. And the next copy operation will free the later one but it's the same entry in the previous one. For example, let's say cmds = { A, B, C, D, E } and excludes = { B, E }. ci cj ei cmds-name excludes -----------+-------------------- 0 0 0 | A B : cmp < 0, ci == cj 1 1 0 | B B : cmp == 0 2 1 1 | C E : cmp < 0, ci != cj At this point, it frees cmds->names[1] and cmds->names[1] is assigned to cmds->names[2]. 3 2 1 | D E : cmp < 0, ci != cj Now it frees cmds->names[2] but it's the same as cmds->names[1]. So accessing cmds->names[1] will be invalid. This makes the subcmd tests succeed. $ perf test subcmd 69: libsubcmd help tests : 69.1: Load subcmd names : Ok 69.2: Uniquify subcmd names : Ok 69.3: Exclude duplicate subcmd names : Ok Fixes: 4b96679170c6 ("libsubcmd: Avoid SEGV/use-after-free when commands aren't excluded") Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/20250701201027.1171561-3-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/lib/subcmd/help.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'tools/lib') diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 8561b0f01a24..9ef569492560 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "subcmd-util.h" #include "help.h" #include "exec-cmd.h" @@ -82,10 +83,11 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) ci++; cj++; } else { - zfree(&cmds->names[cj]); - cmds->names[cj++] = cmds->names[ci++]; + cmds->names[cj++] = cmds->names[ci]; + cmds->names[ci++] = NULL; } } else if (cmp == 0) { + zfree(&cmds->names[ci]); ci++; ei++; } else if (cmp > 0) { @@ -94,12 +96,12 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) } if (ci != cj) { while (ci < cmds->cnt) { - zfree(&cmds->names[cj]); - cmds->names[cj++] = cmds->names[ci++]; + cmds->names[cj++] = cmds->names[ci]; + cmds->names[ci++] = NULL; } } for (ci = cj; ci < cmds->cnt; ci++) - zfree(&cmds->names[ci]); + assert(cmds->names[ci] == NULL); cmds->cnt = cj; } -- cgit v1.2.3 From 478272d1cdd9959a6d638e9d81f70642f04290c9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 17 Jul 2025 08:08:53 -0700 Subject: tools subcmd: Tighten the filename size in check_if_command_finished FILENAME_MAX is often PATH_MAX (4kb), far more than needed for the /proc path. Make the buffer size sufficient for the maximum integer plus "/proc/" and "/status" with a '\0' terminator. Fixes: 5ce42b5de461 ("tools subcmd: Add non-waitpid check_if_command_finished()") Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20250717150855.1032526-1-irogers@google.com Signed-off-by: Namhyung Kim --- tools/lib/subcmd/run-command.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'tools/lib') diff --git a/tools/lib/subcmd/run-command.c b/tools/lib/subcmd/run-command.c index 0a764c25c384..b7510f83209a 100644 --- a/tools/lib/subcmd/run-command.c +++ b/tools/lib/subcmd/run-command.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -216,10 +217,20 @@ static int wait_or_whine(struct child_process *cmd, bool block) return result; } +/* + * Conservative estimate of number of characaters needed to hold an a decoded + * integer, assume each 3 bits needs a character byte and plus a possible sign + * character. + */ +#ifndef is_signed_type +#define is_signed_type(type) (((type)(-1)) < (type)1) +#endif +#define MAX_STRLEN_TYPE(type) (sizeof(type) * 8 / 3 + (is_signed_type(type) ? 1 : 0)) + int check_if_command_finished(struct child_process *cmd) { #ifdef __linux__ - char filename[FILENAME_MAX + 12]; + char filename[6 + MAX_STRLEN_TYPE(typeof(cmd->pid)) + 7 + 1]; char status_line[256]; FILE *status_file; @@ -227,7 +238,7 @@ int check_if_command_finished(struct child_process *cmd) * Check by reading /proc//status as calling waitpid causes * stdout/stderr to be closed and data lost. */ - sprintf(filename, "/proc/%d/status", cmd->pid); + sprintf(filename, "/proc/%u/status", cmd->pid); status_file = fopen(filename, "r"); if (status_file == NULL) { /* Open failed assume finish_command was called. */ -- cgit v1.2.3 From 6d765f5f7ec669f2a16b44afd23cd877efa640de Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 18 Jul 2025 20:05:08 -0700 Subject: libperf evsel: Rename own_cpus to pmu_cpus own_cpus is generally the cpumask from the PMU. Rename to pmu_cpus to try to make this clearer. Variable rename with no other changes. Reviewed-by: Thomas Falcon Signed-off-by: Ian Rogers Tested-by: James Clark Link: https://lore.kernel.org/r/20250719030517.1990983-7-irogers@google.com Signed-off-by: Namhyung Kim --- tools/lib/perf/evlist.c | 8 ++++---- tools/lib/perf/evsel.c | 2 +- tools/lib/perf/include/internal/evsel.h | 2 +- tools/perf/tests/event_update.c | 4 ++-- tools/perf/util/evsel.c | 6 +++--- tools/perf/util/header.c | 4 ++-- tools/perf/util/parse-events.c | 2 +- tools/perf/util/synthetic-events.c | 4 ++-- tools/perf/util/tool_pmu.c | 12 ++++++------ 9 files changed, 22 insertions(+), 22 deletions(-) (limited to 'tools/lib') diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index b1f4c8176b32..9d9dec21f510 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -46,7 +46,7 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, * are valid by intersecting with those of the PMU. */ perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__intersect(evlist->user_requested_cpus, evsel->own_cpus); + evsel->cpus = perf_cpu_map__intersect(evlist->user_requested_cpus, evsel->pmu_cpus); /* * Empty cpu lists would eventually get opened as "any" so remove @@ -61,7 +61,7 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, list_for_each_entry_from(next, &evlist->entries, node) next->idx--; } - } else if (!evsel->own_cpus || evlist->has_user_cpus || + } else if (!evsel->pmu_cpus || evlist->has_user_cpus || (!evsel->requires_cpu && perf_cpu_map__has_any_cpu(evlist->user_requested_cpus))) { /* * The PMU didn't specify a default cpu map, this isn't a core @@ -72,13 +72,13 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, */ perf_cpu_map__put(evsel->cpus); evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); - } else if (evsel->cpus != evsel->own_cpus) { + } else if (evsel->cpus != evsel->pmu_cpus) { /* * No user requested cpu map but the PMU cpu map doesn't match * the evsel's. Reset it back to the PMU cpu map. */ perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__get(evsel->own_cpus); + evsel->cpus = perf_cpu_map__get(evsel->pmu_cpus); } if (evsel->system_wide) { diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 2a85e0bfee1e..127abe7df63d 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -46,7 +46,7 @@ void perf_evsel__delete(struct perf_evsel *evsel) assert(evsel->mmap == NULL); /* If not munmap wasn't called. */ assert(evsel->sample_id == NULL); /* If not free_id wasn't called. */ perf_cpu_map__put(evsel->cpus); - perf_cpu_map__put(evsel->own_cpus); + perf_cpu_map__put(evsel->pmu_cpus); perf_thread_map__put(evsel->threads); free(evsel); } diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h index ea78defa77d0..b97dc8c92882 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -99,7 +99,7 @@ struct perf_evsel { * cpu map for opening the event on, for example, the first CPU on a * socket for an uncore event. */ - struct perf_cpu_map *own_cpus; + struct perf_cpu_map *pmu_cpus; struct perf_thread_map *threads; struct xyarray *fd; struct xyarray *mmap; diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 9301fde11366..cb9e6de2e033 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -109,8 +109,8 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes TEST_ASSERT_VAL("failed to synthesize attr update name", !perf_event__synthesize_event_update_name(&tmp.tool, evsel, process_event_name)); - perf_cpu_map__put(evsel->core.own_cpus); - evsel->core.own_cpus = perf_cpu_map__new("1,2,3"); + perf_cpu_map__put(evsel->core.pmu_cpus); + evsel->core.pmu_cpus = perf_cpu_map__new("1,2,3"); TEST_ASSERT_VAL("failed to synthesize attr update cpus", !perf_event__synthesize_event_update_cpus(&tmp.tool, evsel, process_event_cpus)); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d9b6bf78d67b..ba0c9799928b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -488,7 +488,7 @@ struct evsel *evsel__clone(struct evsel *dest, struct evsel *orig) return NULL; evsel->core.cpus = perf_cpu_map__get(orig->core.cpus); - evsel->core.own_cpus = perf_cpu_map__get(orig->core.own_cpus); + evsel->core.pmu_cpus = perf_cpu_map__get(orig->core.pmu_cpus); evsel->core.threads = perf_thread_map__get(orig->core.threads); evsel->core.nr_members = orig->core.nr_members; evsel->core.system_wide = orig->core.system_wide; @@ -1527,7 +1527,7 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, attr->exclude_user = 1; } - if (evsel->core.own_cpus || evsel->unit) + if (evsel->core.pmu_cpus || evsel->unit) evsel->core.attr.read_format |= PERF_FORMAT_ID; /* @@ -1680,7 +1680,7 @@ void evsel__exit(struct evsel *evsel) evsel__free_config_terms(evsel); cgroup__put(evsel->cgrp); perf_cpu_map__put(evsel->core.cpus); - perf_cpu_map__put(evsel->core.own_cpus); + perf_cpu_map__put(evsel->core.pmu_cpus); perf_thread_map__put(evsel->core.threads); zfree(&evsel->group_name); zfree(&evsel->name); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 53d54fbda10d..d941d7aa0f49 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4507,8 +4507,8 @@ int perf_event__process_event_update(const struct perf_tool *tool __maybe_unused case PERF_EVENT_UPDATE__CPUS: map = cpu_map__new_data(&ev->cpus.cpus); if (map) { - perf_cpu_map__put(evsel->core.own_cpus); - evsel->core.own_cpus = map; + perf_cpu_map__put(evsel->core.pmu_cpus); + evsel->core.pmu_cpus = map; } else pr_err("failed to get event_update cpus\n"); default: diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index a337e4d22ff2..d506f9943506 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -320,7 +320,7 @@ __add_event(struct list_head *list, int *idx, (*idx)++; evsel->core.cpus = cpus; - evsel->core.own_cpus = perf_cpu_map__get(cpus); + evsel->core.pmu_cpus = perf_cpu_map__get(cpus); evsel->core.requires_cpu = pmu ? pmu->is_uncore : false; evsel->core.is_pmu_core = is_pmu_core; evsel->pmu = pmu; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 2fc4d0537840..7c00b09e3a93 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -2045,7 +2045,7 @@ int perf_event__synthesize_event_update_name(const struct perf_tool *tool, struc int perf_event__synthesize_event_update_cpus(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { - struct synthesize_cpu_map_data syn_data = { .map = evsel->core.own_cpus }; + struct synthesize_cpu_map_data syn_data = { .map = evsel->core.pmu_cpus }; struct perf_record_event_update *ev; int err; @@ -2126,7 +2126,7 @@ int perf_event__synthesize_extra_attr(const struct perf_tool *tool, struct evlis } } - if (evsel->core.own_cpus) { + if (evsel->core.pmu_cpus) { err = perf_event__synthesize_event_update_cpus(tool, evsel, process); if (err < 0) { pr_err("Couldn't synthesize evsel cpus.\n"); diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c index 7aa4f315b0ac..d99e699e646d 100644 --- a/tools/perf/util/tool_pmu.c +++ b/tools/perf/util/tool_pmu.c @@ -357,10 +357,10 @@ bool tool_pmu__read_event(enum tool_pmu_event ev, struct evsel *evsel, u64 *resu /* * "Any CPU" event that can be scheduled on any CPU in * the PMU's cpumask. The PMU cpumask should be saved in - * own_cpus. If not present fall back to max. + * pmu_cpus. If not present fall back to max. */ - if (!perf_cpu_map__is_empty(evsel->core.own_cpus)) - *result = perf_cpu_map__nr(evsel->core.own_cpus); + if (!perf_cpu_map__is_empty(evsel->core.pmu_cpus)) + *result = perf_cpu_map__nr(evsel->core.pmu_cpus); else *result = cpu__max_present_cpu().cpu; } @@ -386,12 +386,12 @@ bool tool_pmu__read_event(enum tool_pmu_event ev, struct evsel *evsel, u64 *resu /* * "Any CPU" event that can be scheduled on any CPU in * the PMU's cpumask. The PMU cpumask should be saved in - * own_cpus, if not present then just the online cpu + * pmu_cpus, if not present then just the online cpu * mask. */ - if (!perf_cpu_map__is_empty(evsel->core.own_cpus)) { + if (!perf_cpu_map__is_empty(evsel->core.pmu_cpus)) { struct perf_cpu_map *tmp = - perf_cpu_map__intersect(online, evsel->core.own_cpus); + perf_cpu_map__intersect(online, evsel->core.pmu_cpus); *result = perf_cpu_map__nr(tmp); perf_cpu_map__put(tmp); -- cgit v1.2.3 From 9a711ef3bd57c124cb7255a4bb8a5166c6b0cef0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 18 Jul 2025 20:05:09 -0700 Subject: libperf evsel: Factor perf_evsel__exit out of perf_evsel__delete This allows the perf_evsel__exit to be called when the struct perf_evsel is embedded inside another struct, such as struct evsel in perf. Reviewed-by: Thomas Falcon Signed-off-by: Ian Rogers Tested-by: James Clark Link: https://lore.kernel.org/r/20250719030517.1990983-8-irogers@google.com Signed-off-by: Namhyung Kim --- tools/lib/perf/evsel.c | 7 ++++++- tools/lib/perf/include/internal/evsel.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'tools/lib') diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 127abe7df63d..13a307fc75ae 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -40,7 +40,7 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr) return evsel; } -void perf_evsel__delete(struct perf_evsel *evsel) +void perf_evsel__exit(struct perf_evsel *evsel) { assert(evsel->fd == NULL); /* If not fds were not closed. */ assert(evsel->mmap == NULL); /* If not munmap wasn't called. */ @@ -48,6 +48,11 @@ void perf_evsel__delete(struct perf_evsel *evsel) perf_cpu_map__put(evsel->cpus); perf_cpu_map__put(evsel->pmu_cpus); perf_thread_map__put(evsel->threads); +} + +void perf_evsel__delete(struct perf_evsel *evsel) +{ + perf_evsel__exit(evsel); free(evsel); } diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h index b97dc8c92882..fefe64ba5e26 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -133,6 +133,7 @@ struct perf_evsel { void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx); +void perf_evsel__exit(struct perf_evsel *evsel); int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); void perf_evsel__close_fd(struct perf_evsel *evsel); void perf_evsel__free_fd(struct perf_evsel *evsel); -- cgit v1.2.3 From 811082e4b668db9689f8ce927a106036b4ed4e96 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 18 Jul 2025 20:05:14 -0700 Subject: perf parse-events: Support user CPUs mixed with threads/processes Counting events system-wide with a specified CPU prior to this change worked: ``` $ perf stat -e 'msr/tsc/,msr/tsc,cpu=cpu_core/,msr/tsc,cpu=cpu_atom/' -a sleep 1 Performance counter stats for 'system wide': 59,393,419,099 msr/tsc/ 33,927,965,927 msr/tsc,cpu=cpu_core/ 25,465,608,044 msr/tsc,cpu=cpu_atom/ ``` However, when counting with process the counts became system wide: ``` $ perf stat -e 'msr/tsc/,msr/tsc,cpu=cpu_core/,msr/tsc,cpu=cpu_atom/' perf test -F 10 10.1: Basic parsing test : Ok 10.2: Parsing without PMU name : Ok 10.3: Parsing with PMU name : Ok Performance counter stats for 'perf test -F 10': 59,233,549 msr/tsc/ 59,227,556 msr/tsc,cpu=cpu_core/ 59,224,053 msr/tsc,cpu=cpu_atom/ ``` Make the handling of CPU maps with event parsing clearer. When an event is parsed creating an evsel the cpus should be either the PMU's cpumask or user specified CPUs. Update perf_evlist__propagate_maps so that it doesn't clobber the user specified CPUs. Try to make the behavior clearer, firstly fix up missing cpumasks. Next, perform sanity checks and adjustments from the global evlist CPU requests and for the PMU including simplifying to the "any CPU"(-1) value. Finally remove the event if the cpumask is empty. So that events are opened with a CPU and a thread change stat's create_perf_stat_counter to give both. With the change things are fixed: ``` $ perf stat --no-scale -e 'msr/tsc/,msr/tsc,cpu=cpu_core/,msr/tsc,cpu=cpu_atom/' perf test -F 10 10.1: Basic parsing test : Ok 10.2: Parsing without PMU name : Ok 10.3: Parsing with PMU name : Ok Performance counter stats for 'perf test -F 10': 63,704,975 msr/tsc/ 47,060,704 msr/tsc,cpu=cpu_core/ (4.62%) 16,640,591 msr/tsc,cpu=cpu_atom/ (2.18%) ``` However, note the "--no-scale" option is used. This is necessary as the running time for the event on the counter isn't the same as the enabled time because the thread doesn't necessarily run on the CPUs specified for the counter. All counter values are scaled with: scaled_value = value * time_enabled / time_running and so without --no-scale the scaled_value becomes very large. This problem already exists on hybrid systems for the same reason. Here are 2 runs of the same code with an instructions event that counts the same on both types of core, there is no real multiplexing happening on the event: ``` $ perf stat -e instructions perf test -F 10 ... Performance counter stats for 'perf test -F 10': 87,896,447 cpu_atom/instructions/ (14.37%) 98,171,964 cpu_core/instructions/ (85.63%) ... $ perf stat --no-scale -e instructions perf test -F 10 ... Performance counter stats for 'perf test -F 10': 13,069,890 cpu_atom/instructions/ (19.32%) 83,460,274 cpu_core/instructions/ (80.68%) ... ``` The scaling has inflated per-PMU instruction counts and the overall count by 2x. To fix this the kernel needs changing when a task+CPU event (or just task event on hybrid) is scheduled out. A fix could be that the state isn't inactive but off for such events, so that time_enabled counts don't accumulate on them. Reviewed-by: Thomas Falcon Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20250719030517.1990983-13-irogers@google.com Signed-off-by: Namhyung Kim --- tools/lib/perf/evlist.c | 119 ++++++++++++++++++++++++++++------------- tools/perf/util/parse-events.c | 10 ++-- tools/perf/util/stat.c | 6 +-- 3 files changed, 87 insertions(+), 48 deletions(-) (limited to 'tools/lib') diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 9d9dec21f510..3ed023f4b190 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -36,49 +36,88 @@ void perf_evlist__init(struct perf_evlist *evlist) static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, struct perf_evsel *evsel) { - if (evsel->system_wide) { - /* System wide: set the cpu map of the evsel to all online CPUs. */ - perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__new_online_cpus(); - } else if (evlist->has_user_cpus && evsel->is_pmu_core) { - /* - * User requested CPUs on a core PMU, ensure the requested CPUs - * are valid by intersecting with those of the PMU. - */ + if (perf_cpu_map__is_empty(evsel->cpus)) { + if (perf_cpu_map__is_empty(evsel->pmu_cpus)) { + /* + * Assume the unset PMU cpus were for a system-wide + * event, like a software or tracepoint. + */ + evsel->pmu_cpus = perf_cpu_map__new_online_cpus(); + } + if (evlist->has_user_cpus && !evsel->system_wide) { + /* + * Use the user CPUs unless the evsel is set to be + * system wide, such as the dummy event. + */ + evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); + } else { + /* + * System wide and other modes, assume the cpu map + * should be set to all PMU CPUs. + */ + evsel->cpus = perf_cpu_map__get(evsel->pmu_cpus); + } + } + /* + * Avoid "any CPU"(-1) for uncore and PMUs that require a CPU, even if + * requested. + */ + if (evsel->requires_cpu && perf_cpu_map__has_any_cpu(evsel->cpus)) { perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__intersect(evlist->user_requested_cpus, evsel->pmu_cpus); + evsel->cpus = perf_cpu_map__get(evsel->pmu_cpus); + } - /* - * Empty cpu lists would eventually get opened as "any" so remove - * genuinely empty ones before they're opened in the wrong place. - */ - if (perf_cpu_map__is_empty(evsel->cpus)) { - struct perf_evsel *next = perf_evlist__next(evlist, evsel); - - perf_evlist__remove(evlist, evsel); - /* Keep idx contiguous */ - if (next) - list_for_each_entry_from(next, &evlist->entries, node) - next->idx--; + /* + * Globally requested CPUs replace user requested unless the evsel is + * set to be system wide. + */ + if (evlist->has_user_cpus && !evsel->system_wide) { + assert(!perf_cpu_map__has_any_cpu(evlist->user_requested_cpus)); + if (!perf_cpu_map__equal(evsel->cpus, evlist->user_requested_cpus)) { + perf_cpu_map__put(evsel->cpus); + evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); } - } else if (!evsel->pmu_cpus || evlist->has_user_cpus || - (!evsel->requires_cpu && perf_cpu_map__has_any_cpu(evlist->user_requested_cpus))) { - /* - * The PMU didn't specify a default cpu map, this isn't a core - * event and the user requested CPUs or the evlist user - * requested CPUs have the "any CPU" (aka dummy) CPU value. In - * which case use the user requested CPUs rather than the PMU - * ones. - */ + } + + /* Ensure cpus only references valid PMU CPUs. */ + if (!perf_cpu_map__has_any_cpu(evsel->cpus) && + !perf_cpu_map__is_subset(evsel->pmu_cpus, evsel->cpus)) { + struct perf_cpu_map *tmp = perf_cpu_map__intersect(evsel->pmu_cpus, evsel->cpus); + perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); - } else if (evsel->cpus != evsel->pmu_cpus) { - /* - * No user requested cpu map but the PMU cpu map doesn't match - * the evsel's. Reset it back to the PMU cpu map. - */ + evsel->cpus = tmp; + } + + /* + * Was event requested on all the PMU's CPUs but the user requested is + * any CPU (-1)? If so switch to using any CPU (-1) to reduce the number + * of events. + */ + if (!evsel->system_wide && + !evsel->requires_cpu && + perf_cpu_map__equal(evsel->cpus, evsel->pmu_cpus) && + perf_cpu_map__has_any_cpu(evlist->user_requested_cpus)) { perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__get(evsel->pmu_cpus); + evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); + } + + /* Sanity check assert before the evsel is potentially removed. */ + assert(!evsel->requires_cpu || !perf_cpu_map__has_any_cpu(evsel->cpus)); + + /* + * Empty cpu lists would eventually get opened as "any" so remove + * genuinely empty ones before they're opened in the wrong place. + */ + if (perf_cpu_map__is_empty(evsel->cpus)) { + struct perf_evsel *next = perf_evlist__next(evlist, evsel); + + perf_evlist__remove(evlist, evsel); + /* Keep idx contiguous */ + if (next) + list_for_each_entry_from(next, &evlist->entries, node) + next->idx--; + + return; } if (evsel->system_wide) { @@ -98,6 +137,10 @@ static void perf_evlist__propagate_maps(struct perf_evlist *evlist) evlist->needs_map_propagation = true; + /* Clear the all_cpus set which will be merged into during propagation. */ + perf_cpu_map__put(evlist->all_cpus); + evlist->all_cpus = NULL; + list_for_each_entry_safe(evsel, n, &evlist->entries, node) __perf_evlist__propagate_maps(evlist, evsel); } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index bd2d831d5123..fe2073c6b549 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -310,20 +310,18 @@ __add_event(struct list_head *list, int *idx, if (pmu) { is_pmu_core = pmu->is_core; pmu_cpus = perf_cpu_map__get(pmu->cpus); + if (perf_cpu_map__is_empty(pmu_cpus)) + pmu_cpus = cpu_map__online(); } else { is_pmu_core = (attr->type == PERF_TYPE_HARDWARE || attr->type == PERF_TYPE_HW_CACHE); pmu_cpus = is_pmu_core ? cpu_map__online() : NULL; } - if (has_user_cpus) { + if (has_user_cpus) cpus = perf_cpu_map__get(user_cpus); - /* Existing behavior that pmu_cpus matches the given user ones. */ - perf_cpu_map__put(pmu_cpus); - pmu_cpus = perf_cpu_map__get(user_cpus); - } else { + else cpus = perf_cpu_map__get(pmu_cpus); - } if (init_attr) event_attr_init(attr); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index b0205e99a4c9..50b1a92d16df 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -769,8 +769,6 @@ int create_perf_stat_counter(struct evsel *evsel, attr->enable_on_exec = 1; } - if (target__has_cpu(target) && !target__has_per_thread(target)) - return evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu_map_idx); - - return evsel__open_per_thread(evsel, evsel->core.threads); + return evsel__open_per_cpu_and_thread(evsel, evsel__cpus(evsel), cpu_map_idx, + evsel->core.threads); } -- cgit v1.2.3