From d24fa977eec53399a9a49a2e1dc592430ea0a607 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 30 Mar 2025 12:34:47 +0900 Subject: tracing: fprobe: Fix to lock module while registering fprobe Since register_fprobe() does not take a reference on the module while registering the fgraph filter, if the target functions (symbols) are in modules, those modules can be unloaded while the fprobe is being registered to fgraph. To avoid this issue, take a module reference for each symbol, and put it after registering the fprobe. Link: https://lore.kernel.org/all/174330568792.459674.16874380163991113156.stgit@devnote2/ Reported-by: Steven Rostedt Closes: https://lore.kernel.org/all/20250325130628.3a9e234c@gandalf.local.home/ Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 67 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 19 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 33082c4e8154..cb86f90d4b1e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -445,6 +445,7 @@ struct filter_match_data { size_t index; size_t size; unsigned long *addrs; + struct module **mods; }; static int filter_match_callback(void *data, const char *name, unsigned long addr) @@ -458,30 +459,47 @@ static int filter_match_callback(void *data, const char *name, unsigned long add if (!ftrace_location(addr)) return 0; - if (match->addrs) - match->addrs[match->index] = addr; + if (match->addrs) { + struct module *mod = __module_text_address(addr); + + if (mod && !try_module_get(mod)) + return 0; + match->mods[match->index] = mod; + match->addrs[match->index] = addr; + } match->index++; return match->index == match->size; } /* * Make IP list from the filter/no-filter glob patterns. - * Return the number of matched symbols, or -ENOENT. + * Return the number of matched symbols, or errno. + * If @addrs == NULL, this just counts the number of matched symbols. If @addrs + * is passed with an array, we need to pass an @mods array of the same size + * to increment the module refcount for each symbol. + * This means we also need to call `module_put` for each element of @mods after + * using the @addrs.
*/ -static int ip_list_from_filter(const char *filter, const char *notfilter, - unsigned long *addrs, size_t size) +static int get_ips_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, struct module **mods, + size_t size) { struct filter_match_data match = { .filter = filter, .notfilter = notfilter, - .index = 0, .size = size, .addrs = addrs}; + .index = 0, .size = size, .addrs = addrs, .mods = mods}; int ret; + if (addrs && !mods) + return -EINVAL; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); if (ret < 0) return ret; - ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); - if (ret < 0) - return ret; + if (IS_ENABLED(CONFIG_MODULES)) { + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + } return match.index ?: -ENOENT; } @@ -543,24 +561,35 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - unsigned long *addrs; - int ret; + unsigned long *addrs __free(kfree) = NULL; + struct module **mods __free(kfree) = NULL; + int ret, num; if (!fp || !filter) return -EINVAL; - ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); - if (ret < 0) - return ret; + num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); + if (num < 0) + return num; - addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; - ret = ip_list_from_filter(filter, notfilter, addrs, ret); - if (ret > 0) - ret = register_fprobe_ips(fp, addrs, ret); - kfree(addrs); + mods = kcalloc(num, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + + ret = get_ips_from_filter(filter, notfilter, addrs, mods, num); + if (ret < 0) + return ret; + + ret = register_fprobe_ips(fp, addrs, ret); + + for (int i = 0; i < num; i++) { + if (mods[i]) + module_put(mods[i]); + } return ret; } EXPORT_SYMBOL_GPL(register_fprobe); -- cgit v1.2.3 From dd941507a9486252d6fcf11814387666792020f3 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 31 Mar 2025 23:05:07 +0900 Subject: tracing: fprobe events: Fix possible UAF on modules Commit ac91052f0ae5 ("tracing: tprobe-events: Fix leakage of module refcount") moved try_module_get() from __find_tracepoint_module_cb() to the find_tracepoint() caller, but that introduced a possible UAF because the module can be unloaded before try_module_get(). In that window the module object may already have been freed, so try_module_get() does not merely fail; it may access the freed object. To avoid that, call try_module_get() in __find_tracepoint_module_cb() again.
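The contract this restores (and which the comment added to find_tracepoint() spells out) is that a non-NULL *tp_mod hands a module reference to the caller. A hypothetical call site would therefore follow this sketch; the tracepoint name and the example_use() wrapper are illustrative, not part of the patch:

static void example_use(void)
{
	struct module *tp_mod = NULL;
	struct tracepoint *tpoint;

	tpoint = find_tracepoint("sched_switch", &tp_mod);
	if (tpoint) {
		/* use tpoint; its owning module (if any) is pinned here */
	}
	if (tp_mod)
		module_put(tp_mod);	/* drop the ref taken in the callback */
}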
Link: https://lore.kernel.org/all/174342990779.781946.9138388479067729366.stgit@devnote2/ Fixes: ac91052f0ae5 ("tracing: tprobe-events: Fix leakage of module refcount") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 985ff98272da..2cd9ff1049f1 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -919,9 +919,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo struct __find_tracepoint_cb_data *data = priv; if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { - data->tpoint = tp; - if (!data->mod) + /* If module is not specified, try getting module refcount. */ + if (!data->mod && mod) { + /* If failed to get refcount, ignore this tracepoint. */ + if (!try_module_get(mod)) + return; + data->mod = mod; + } + data->tpoint = tp; } } @@ -933,7 +939,11 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) data->tpoint = tp; } -/* Find a tracepoint from kernel and module. */ +/* + * Find a tracepoint from kernel and module. If the tracepoint is on the module, + * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is + * not NULL, caller must call module_put(*tp_mod) after used the tracepoint. + */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) { @@ -962,7 +972,10 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } -/* Find a tracepoint from specified module. */ +/* + * Find a tracepoint from specified module. In this case, this does not get the + * module's refcount. The caller must ensure the module is not freed. + */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -1169,11 +1182,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_tracepoint) { ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); - /* lock module until register this tprobe. */ - if (tp_mod && !try_module_get(tp_mod)) { - tpoint = NULL; - tp_mod = NULL; - } if (tpoint) { ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, -- cgit v1.2.3 From 4808595a9922e89726ec5611d7749b63966b7fa8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Apr 2025 12:01:11 -0400 Subject: tracing: Hide get_vm_area() from MMUless builds The function get_vm_area() is not defined for non-MMU builds and causes a build error if it is used. Hide the map_pages() function around a: #ifdef CONFIG_MMU to keep it from being compiled when CONFIG_MMU is not set. 
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250407120111.2ccc9319@gandalf.local.home Reported-by: Guenter Roeck Tested-by: Guenter Roeck Closes: https://lore.kernel.org/all/4f8ece8b-8862-4f7c-8ede-febd28f8a9fe@roeck-us.net/ Fixes: 394f3f02de531 ("tracing: Use vmap_page_range() to map memmap ring buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b581e388a9d9..8ddf6b17215c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9806,6 +9806,7 @@ static int instance_mkdir(const char *name) return ret; } +#ifdef CONFIG_MMU static u64 map_pages(unsigned long start, unsigned long size) { unsigned long vmap_start, vmap_end; @@ -9828,6 +9829,12 @@ static u64 map_pages(unsigned long start, unsigned long size) return (u64)vmap_start; } +#else +static inline u64 map_pages(unsigned long start, unsigned long size) +{ + return 0; +} +#endif /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. -- cgit v1.2.3 From a3dc2983ca7b90fd35f978502de6d4664d965cfb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 1 Apr 2025 00:35:44 +0900 Subject: tracing: fprobe: Cleanup fprobe hash when module unloading Clean up the fprobe address hash table on module unloading, because the target symbols disappear when the module is unloaded and there is no guarantee that the same symbols will be mapped at the same addresses if it is loaded again. Note that this at least disables the fprobes whose target symbols are partly in the unloaded module. Unlike kprobes, fprobe does not re-enable the probe point by itself. To do that, the caller should take care of registering/unregistering the fprobe when loading/unloading modules. This simplifies the fprobe state management related to module loading/unloading.
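The mechanism the patch builds on is the standard module notifier: a callback registered with register_module_notifier() is invoked as a module walks through its lifecycle states, and the cleanup reacts to MODULE_STATE_GOING. A minimal sketch of that pattern, with placeholder my_* names (the real callback below additionally collects and removes the affected probe addresses):

#include <linux/module.h>
#include <linux/notifier.h>

static int my_module_callback(struct notifier_block *nb,
			      unsigned long val, void *data)
{
	struct module *mod = data;

	if (val != MODULE_STATE_GOING)
		return NOTIFY_DONE;

	/* The module is being unloaded: drop any state that still
	 * points into its text before the memory disappears. */
	pr_debug("dropping probes in %s\n", mod->name);
	return NOTIFY_DONE;
}

static struct notifier_block my_module_nb = {
	.notifier_call = my_module_callback,
};

static int __init my_init(void)
{
	return register_module_notifier(&my_module_nb);
}
early_initcall(my_init);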
Link: https://lore.kernel.org/all/174343534473.843280.13988101014957210732.stgit@devnote2/ Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cb86f90d4b1e..95c6e3473a76 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -89,8 +89,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); - WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + /* Avoid double deleting */ + if (READ_ONCE(node->fp) != NULL) { + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + } return !!find_first_fprobe_node(node->addr); } @@ -411,6 +414,102 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } +#ifdef CONFIG_MODULES + +#define FPROBE_IPS_BATCH_INIT 8 +/* instruction pointer address list */ +struct fprobe_addr_list { + int index; + int size; + unsigned long *addrs; +}; + +static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr) +{ + unsigned long *addrs; + + if (alist->index >= alist->size) + return -ENOMEM; + + alist->addrs[alist->index++] = addr; + if (alist->index < alist->size) + return 0; + + /* Expand the address list */ + addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs)); + alist->size *= 2; + kfree(alist->addrs); + alist->addrs = addrs; + + return 0; +} + +static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, + struct fprobe_addr_list *alist) +{ + struct fprobe_hlist_node *node; + int ret = 0; + + hlist_for_each_entry_rcu(node, head, hlist) { + if (!within_module(node->addr, mod)) + continue; + if (delete_fprobe_node(node)) + continue; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); + } +} + +/* Handle module unloading to manage fprobe_ip_table. */ +static int fprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct module *mod = data; + int i; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); + /* If failed to alloc memory, we can not remove ips from hash. 
*/ + if (!alist.addrs) + return NOTIFY_DONE; + + mutex_lock(&fprobe_mutex); + for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) + fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + + if (alist.index < alist.size && alist.index > 0) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, + alist.addrs, alist.index, 1, 0); + mutex_unlock(&fprobe_mutex); + + kfree(alist.addrs); + + return NOTIFY_DONE; +} + +static struct notifier_block fprobe_module_nb = { + .notifier_call = fprobe_module_callback, + .priority = 0, +}; + +static int __init init_fprobe_module(void) +{ + return register_module_notifier(&fprobe_module_nb); +} +early_initcall(init_fprobe_module); +#endif + static int symbols_cmp(const void *a, const void *b) { const char **str_a = (const char **) a; -- cgit v1.2.3 From e1a453a57bc76be678bd746f84e3d73f378a9511 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Apr 2025 15:41:39 -0400 Subject: tracing: Do not add length to print format in synthetic events The following causes a vsnprintf fault: # echo 's:wake_lat char[] wakee; u64 delta;' >> /sys/kernel/tracing/dynamic_events # echo 'hist:keys=pid:ts=common_timestamp.usecs if !(common_flags & 0x18)' > /sys/kernel/tracing/events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:delta=common_timestamp.usecs-$ts:onmatch(sched.sched_waking).trace(wake_lat,next_comm,$delta)' > /sys/kernel/tracing/events/sched/sched_switch/trigger Because the synthetic event's "wakee" field is created as a dynamic string (even though the string copied is not). The print format to print the dynamic string changed from "%*s" to "%s" because another location (__set_synth_event_print_fmt()) exported this to user space, and user space did not need that. But it is still used in print_synth_event(), and the output looks like: -0 [001] d..5. 193.428167: wake_lat: wakee=(efault)sshd-sessiondelta=155 sshd-session-879 [001] d..5. 193.811080: wake_lat: wakee=(efault)kworker/u34:5delta=58 -0 [002] d..5. 193.811198: wake_lat: wakee=(efault)bashdelta=91 bash-880 [002] d..5. 193.811371: wake_lat: wakee=(efault)kworker/u35:2delta=21 -0 [001] d..5. 193.811516: wake_lat: wakee=(efault)sshd-sessiondelta=129 sshd-session-879 [001] d..5. 193.967576: wake_lat: wakee=(efault)kworker/u34:5delta=50 The length isn't needed as the string is always nul terminated. Just print the string and not add the length (which was hard coded to the max string length anyway). Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Tom Zanussi Cc: Douglas Raillard Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/20250407154139.69955768@gandalf.local.home Fixes: 4d38328eb442d ("tracing: Fix synth event printk format for str fields"); Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 969f48742d72..33cfbd4ed76d 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -370,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? 
"" : " "); n_u64++; -- cgit v1.2.3 From 04a80a34c22f4db245f553d8696d1318d1c00ece Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Wed, 9 Apr 2025 00:02:57 +0800 Subject: ftrace: Properly merge notrace hashes The global notrace hash should be jointly decided by the intersection of each subops's notrace hash, but not the filter hash. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250408160258.48563-1-andybnac@gmail.com Fixes: 5fccc7552ccb ("ftrace: Add subops logic to allow one ops to manage many") Signed-off-by: Andy Chiu [ fixed removing of freeing of filter_hash ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1a48aedb5255..8939eeebb02e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3526,16 +3526,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int ftrace_hash_empty(subops->func_hash->notrace_hash)) { notrace_hash = EMPTY_HASH; } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); + size_bits = max(ops->func_hash->notrace_hash->size_bits, + subops->func_hash->notrace_hash->size_bits); notrace_hash = alloc_ftrace_hash(size_bits); if (!notrace_hash) { free_ftrace_hash(filter_hash); return -ENOMEM; } - ret = intersect_hash(¬race_hash, ops->func_hash->filter_hash, - subops->func_hash->filter_hash); + ret = intersect_hash(¬race_hash, ops->func_hash->notrace_hash, + subops->func_hash->notrace_hash); if (ret < 0) { free_ftrace_hash(filter_hash); free_ftrace_hash(notrace_hash); -- cgit v1.2.3 From 0ae6b8ce200da00a78f33c055fdc4fe3225d22ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Apr 2025 11:15:50 -0400 Subject: ftrace: Fix accounting of subop hashes The function graph infrastructure uses ftrace to hook to functions. It has a single ftrace_ops to manage all the users of function graph. Each individual user (tracing, bpf, fprobes, etc) has its own ftrace_ops to track the functions it will have its callback called from. These ftrace_ops are "subops" to the main ftrace_ops of the function graph infrastructure. Each ftrace_ops has a filter_hash and a notrace_hash that is defined as: Only trace functions that are in the filter_hash but not in the notrace_hash. If the filter_hash is empty, it means to trace all functions. If the notrace_hash is empty, it means do not disable any function. The function graph main ftrace_ops needs to be a superset containing all the functions to be traced by all the subops it has. The algorithm to perform this merge was incorrect. When the first subops was added to the main ops, it simply made the main ops a copy of the subops (same filter_hash and notrace_hash). When a second ops was added, it joined the new subops filter_hash with the main ops filter_hash as a union of the two sets. The intersect between the new subops notrace_hash and the main ops notrace_hash was created as the new notrace_hash of the main ops. The issue here is that it would then start tracing functions than no subops were tracing. 
For example if you had two subops that had: subops 1: filter_hash = '*sched*' # trace all functions with "sched" in it notrace_hash = '*time*' # except do not trace functions with "time" subops 2: filter_hash = '*lock*' # trace all functions with "lock" in it notrace_hash = '*clock*' # except do not trace functions with "clock" The intersect of '*time*' functions with '*clock*' functions could be the empty set. That means the main ops will be tracing all functions with '*time*' and all "*clock*" in it! Instead, modify the algorithm to be a bit simpler and correct. First, when adding a new subops, even if it's the first one, do not add the notrace_hash if the filter_hash is not empty. Instead, just add the functions that are in the filter_hash of the subops but not in the notrace_hash of the subops into the main ops filter_hash. There's no reason to add anything to the main ops notrace_hash. The notrace_hash of the main ops should only be non empty iff all subops filter_hashes are empty (meaning to trace all functions) and all subops notrace_hashes include the same functions. That is, the main ops notrace_hash is empty if any subops filter_hash is non empty. The main ops notrace_hash only has content in it if all subops filter_hashes are empty, and the content are only functions that intersect all the subops notrace_hashes. If any subops notrace_hash is empty, then so is the main ops notrace_hash. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Shuah Khan Cc: Andy Chiu Link: https://lore.kernel.org/20250409152720.216356767@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 314 ++++++++++++++++++++++++++++---------------------- 1 file changed, 177 insertions(+), 137 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8939eeebb02e..a8a02868b435 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3255,6 +3255,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, return 0; } +/* + * Remove functions from @hash that are in @notrace_hash + */ +static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tmp; + int size; + int i; + + /* If the notrace hash is empty, there's nothing to do */ + if (ftrace_hash_empty(notrace_hash)) + return; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { + if (!__ftrace_lookup_ip(notrace_hash, entry->ip)) + continue; + remove_hash_entry(hash, entry); + kfree(entry); + } + } +} + /* * Add to @hash only those that are in both @new_hash1 and @new_hash2 * @@ -3295,67 +3320,6 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has return 0; } -/* Return a new hash that has a union of all @ops->filter_hash entries */ -static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - if (ops->func_hash->filter_hash) - size_bits = ops->func_hash->filter_hash->size_bits; - else - size_bits = FTRACE_HASH_DEFAULT_BITS; - - list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - /* Can't return 
NULL as that means this failed */ - return new_hash ? : EMPTY_HASH; -} - -/* Make @ops trace evenything except what all its subops do not trace */ -static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - list_for_each_entry(subops, &ops->subop_list, list) { - struct ftrace_hash *next_hash; - - if (!new_hash) { - size_bits = subops->func_hash->notrace_hash->size_bits; - new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash); - if (!new_hash) - return NULL; - continue; - } - size_bits = new_hash->size_bits; - next_hash = new_hash; - new_hash = alloc_ftrace_hash(size_bits); - ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash); - free_ftrace_hash(next_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B) { struct ftrace_func_entry *entry; @@ -3427,6 +3391,93 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ return 0; } +static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *func_hash) +{ + /* If the filter hash is not empty, simply remove the nohash from it */ + if (!ftrace_hash_empty(func_hash->filter_hash)) { + *filter_hash = copy_hash(func_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + remove_hash(*filter_hash, func_hash->notrace_hash); + *notrace_hash = EMPTY_HASH; + + } else { + *notrace_hash = copy_hash(func_hash->notrace_hash); + if (!*notrace_hash) + return -ENOMEM; + *filter_hash = EMPTY_HASH; + } + return 0; +} + +static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash) +{ + int size_bits; + int ret; + + /* If the subops trace all functions so must the main ops */ + if (ftrace_hash_empty(ops_hash->filter_hash) || + ftrace_hash_empty(subops_hash->filter_hash)) { + *filter_hash = EMPTY_HASH; + } else { + /* + * The main ops filter hash is not empty, so its + * notrace_hash had better be, as the notrace hash + * is only used for empty main filter hashes. + */ + WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash)); + + size_bits = max(ops_hash->filter_hash->size_bits, + subops_hash->filter_hash->size_bits); + + /* Copy the subops hash */ + *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); + if (!filter_hash) + return -ENOMEM; + /* Remove any notrace functions from the copy */ + remove_hash(*filter_hash, subops_hash->notrace_hash); + + ret = append_hash(filter_hash, ops_hash->filter_hash, + size_bits); + if (ret < 0) { + free_ftrace_hash(*filter_hash); + return ret; + } + } + + /* + * Only process notrace hashes if the main filter hash is empty + * (tracing all functions), otherwise the filter hash will just + * remove the notrace hash functions, and the notrace hash is + * not needed. + */ + if (ftrace_hash_empty(*filter_hash)) { + /* + * Intersect the notrace functions. That is, if two + * subops are not tracing a set of functions, the + * main ops will only not trace the functions that are + * in both subops, but has to trace the functions that + * are only notrace in one of the subops, for the other + * subops to be able to trace them. 
+ */ + size_bits = max(ops_hash->notrace_hash->size_bits, + subops_hash->notrace_hash->size_bits); + *notrace_hash = alloc_ftrace_hash(size_bits); + if (!*notrace_hash) + return -ENOMEM; + + ret = intersect_hash(notrace_hash, ops_hash->notrace_hash, + subops_hash->notrace_hash); + if (ret < 0) { + free_ftrace_hash(*notrace_hash); + return ret; + } + } + return 0; +} + /** * ftrace_startup_subops - enable tracing for subops of an ops * @ops: Manager ops (used to pick all the functions of its subops) @@ -3443,7 +3494,6 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int struct ftrace_hash *notrace_hash; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; - int size_bits; int ret; if (unlikely(ftrace_disabled)) @@ -3467,14 +3517,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* For the first subops to ops just enable it normally */ if (list_empty(&ops->subop_list)) { - /* Just use the subops hashes */ - filter_hash = copy_hash(subops->func_hash->filter_hash); - notrace_hash = copy_hash(subops->func_hash->notrace_hash); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return -ENOMEM; - } + + /* The ops was empty, should have empty hashes */ + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash)); + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash)); + + ret = add_first_hash(&filter_hash, &notrace_hash, subops->func_hash); + if (ret < 0) + return ret; save_filter_hash = ops->func_hash->filter_hash; save_notrace_hash = ops->func_hash->notrace_hash; @@ -3500,48 +3550,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* * Here there's already something attached. Here are the rules: - * o If either filter_hash is empty then the final stays empty - * o Otherwise, the final is a superset of both hashes - * o If either notrace_hash is empty then the final stays empty - * o Otherwise, the final is an intersection between the hashes + * If the new subops and main ops filter hashes are not empty: + * o Make a copy of the subops filter hash + * o Remove all functions in the nohash from it.
+ * o Add in the main hash filter functions + * o Remove any of these functions from the main notrace hash */ - if (ftrace_hash_empty(ops->func_hash->filter_hash) || - ftrace_hash_empty(subops->func_hash->filter_hash)) { - filter_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); - if (!filter_hash) - return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash, - size_bits); - if (ret < 0) { - free_ftrace_hash(filter_hash); - return ret; - } - } - - if (ftrace_hash_empty(ops->func_hash->notrace_hash) || - ftrace_hash_empty(subops->func_hash->notrace_hash)) { - notrace_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->notrace_hash->size_bits, - subops->func_hash->notrace_hash->size_bits); - notrace_hash = alloc_ftrace_hash(size_bits); - if (!notrace_hash) { - free_ftrace_hash(filter_hash); - return -ENOMEM; - } - ret = intersect_hash(&notrace_hash, ops->func_hash->notrace_hash, - subops->func_hash->notrace_hash); - if (ret < 0) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return ret; - } - } + ret = add_next_hash(&filter_hash, &notrace_hash, ops->func_hash, subops->func_hash); + if (ret < 0) + return ret; list_add(&subops->list, &ops->subop_list); @@ -3557,6 +3575,42 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int return ret; } +static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops *ops) +{ + struct ftrace_ops_hash temp_hash; + struct ftrace_ops *subops; + bool first = true; + int ret; + + temp_hash.filter_hash = EMPTY_HASH; + temp_hash.notrace_hash = EMPTY_HASH; + + list_for_each_entry(subops, &ops->subop_list, list) { + *filter_hash = EMPTY_HASH; + *notrace_hash = EMPTY_HASH; + + if (first) { + ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash); + if (ret < 0) + return ret; + first = false; + } else { + ret = add_next_hash(filter_hash, notrace_hash, + &temp_hash, subops->func_hash); + if (ret < 0) { + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + return ret; + } + } + + temp_hash.filter_hash = *filter_hash; + temp_hash.notrace_hash = *notrace_hash; + } + return 0; +} + /** * ftrace_shutdown_subops - Remove a subops from a manager ops * @ops: A manager ops to remove @subops from @@ -3605,14 +3659,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in } /* Rebuild the hashes without subops */ - filter_hash = append_hashes(ops); - notrace_hash = intersect_hashes(ops); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - list_add(&subops->list, &ops->subop_list); - return -ENOMEM; - } + ret = rebuild_hashes(&filter_hash, &notrace_hash, ops); + if (ret < 0) + return ret; ret = ftrace_update_ops(ops, filter_hash, notrace_hash); if (ret < 0) { @@ -3628,11 +3677,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, struct ftrace_hash **orig_subhash, - struct ftrace_hash *hash, - int enable) + struct ftrace_hash *hash) { struct ftrace_ops *ops = subops->managed; - struct ftrace_hash **orig_hash; + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; struct ftrace_hash *save_hash; struct ftrace_hash *new_hash; int ret; @@
-3649,24 +3698,15 @@ static int ftrace_hash_move_and_update_subops, return -ENOMEM; } - /* Create a new_hash to hold the ops new functions */ - if (enable) { - orig_hash = &ops->func_hash->filter_hash; - new_hash = append_hashes(ops); - } else { - orig_hash = &ops->func_hash->notrace_hash; - new_hash = intersect_hashes(ops); - } - - /* Move the hash over to the new hash */ - ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); - - free_ftrace_hash(new_hash); + ret = rebuild_hashes(&filter_hash, &notrace_hash, ops); + if (!ret) + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); if (ret) { /* Put back the original hash */ - free_ftrace_hash_rcu(*orig_subhash); + new_hash = *orig_subhash; *orig_subhash = save_hash; + free_ftrace_hash_rcu(new_hash); } else { free_ftrace_hash_rcu(save_hash); } @@ -4890,7 +4930,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, int enable) { if (ops->flags & FTRACE_OPS_FL_SUBOP) - return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(ops, orig_hash, hash); /* * If this ops is not enabled, it could be sharing its filters @@ -4909,7 +4949,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, list_for_each_entry(subops, &op->subop_list, list) { if ((subops->flags & FTRACE_OPS_FL_ENABLED) && subops->func_hash == ops->func_hash) { - return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(subops, orig_hash, hash); } } } while_for_each_ftrace_op(op); -- cgit v1.2.3 From 485acd207d7daf8cf941a5f0fd0c09bc6d049402 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Apr 2025 13:30:15 -0400 Subject: ftrace: Do not have print_graph_retval() add a newline The retval and retaddr options for the function_graph tracer will add a comment at the end of a function for both leaf and non-leaf functions that looks like: __wake_up_common(); /* ret=0x1 */ } /* pick_next_task_fair ret=0x0 */ The function print_graph_retval() adds a newline after the "*/". But if that's not called, the caller function needs to make sure there's a newline added. This is confusing, and when the function parameters code was added, it added a newline even when calling print_graph_retval(), as the fact that the print_graph_retval() function prints a newline isn't obvious. This caused an extra newline to be printed, and that made it fail the selftests when the retval option was set, as the selftests were not expecting blank lines being injected into the trace. Instead of having print_graph_retval() print a newline, just have the caller always print the newline regardless of whether it calls print_graph_retval() or not. This not only fixes this bug, but it also simplifies the code.
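The resulting division of labor is uniform: print_graph_retval() emits only its optional " /* ... */" fragment, and every caller terminates its own output line. In sketch form (simplified; print_retval_comment() is a made-up stand-in for the real helper):

/* helper: appends an optional comment, never a newline */
static void print_retval_comment(struct trace_seq *s, long retval)
{
	trace_seq_printf(s, " /* ret=%#lx */", retval);
}

/* caller: always owns the end of the line */
print_retval_comment(s, retval);
trace_seq_putc(s, '\n');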
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250411133015.015ca393@gandalf.local.home Reported-by: Mark Brown Tested-by: Mark Brown Closes: https://lore.kernel.org/all/ccc40f2b-4b9e-4abd-8daf-d22fce2a86f0@sirena.org.uk/ Fixes: ff5c9c576e754 ("ftrace: Add support for function argument to graph tracer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2f077d4158e5..0c357a89c58e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -880,8 +880,6 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); - else - trace_seq_putc(s, '\n'); } else { print_retaddr = false; trace_seq_printf(s, "} /* %ps", func); @@ -899,7 +897,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr } if (!entry || print_retval || print_retaddr) - trace_seq_puts(s, " */\n"); + trace_seq_puts(s, " */"); } #else @@ -975,7 +973,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, } else trace_seq_puts(s, "();"); } - trace_seq_printf(s, "\n"); + trace_seq_putc(s, '\n'); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -1313,10 +1311,11 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * that if the funcgraph-tail option is enabled. */ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); + trace_seq_puts(s, "}"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)func); + trace_seq_printf(s, "} /* %ps */", (void *)func); } + trace_seq_putc(s, '\n'); /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) -- cgit v1.2.3 From 8d7861ac507d23024c7d74b6cb59a9cca248bcb7 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 11 Apr 2025 09:37:17 +0200 Subject: rv: Fix out-of-bound memory access in rv_is_container_monitor() When rv_is_container_monitor() is called on the last monitor in rv_monitors_list, KASAN yells: BUG: KASAN: global-out-of-bounds in rv_is_container_monitor+0x101/0x110 Read of size 8 at addr ffffffff97c7c798 by task setup/221 The buggy address belongs to the variable: rv_monitors_list+0x18/0x40 This is due to list_next_entry() being called on the last entry in the list. It wraps around to the first list_head, and the first list_head is not embedded in struct rv_monitor_def. Fix it by checking whether the monitor is the last one in the list.
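The pitfall is generic to kernel linked lists: list_next_entry() on the tail entry returns the container of the bare list head, which is not embedded in an entry structure at all. The safe idiom, in isolation (struct my_mon and the helper name are placeholders):

#include <linux/list.h>

struct my_mon {
	struct list_head list;
	/* ... */
};

static struct my_mon *my_next_or_null(struct my_mon *pos, struct list_head *head)
{
	/* rule out the tail first, or list_next_entry() aliases the head */
	if (list_is_last(&pos->list, head))
		return NULL;
	return list_next_entry(pos, list);
}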
Cc: stable@vger.kernel.org Cc: Gabriele Monaco Fixes: cb85c660fcd4 ("rv: Add option for nested monitors and include sched") Link: https://lore.kernel.org/e85b5eeb7228bfc23b8d7d4ab5411472c54ae91b.1744355018.git.namcao@linutronix.de Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 968c5c3b0246..e4077500a91d 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -225,7 +225,12 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef) */ bool rv_is_container_monitor(struct rv_monitor_def *mdef) { - struct rv_monitor_def *next = list_next_entry(mdef, list); + struct rv_monitor_def *next; + + if (list_is_last(&mdef->list, &rv_monitors_list)) + return false; + + next = list_next_entry(mdef, list); return next->parent == mdef->monitor || !mdef->monitor->enable; } -- cgit v1.2.3 From 31d1139956112dd047a70b263f4d578921de779a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 10:40:17 -0400 Subject: ftrace: Initialize variables for ftrace_startup/shutdown_subops() The reworking to fix and simplify ftrace_startup_subops() and ftrace_shutdown_subops() made it possible for the filter_hash and notrace_hash variables to be used uninitialized in a way that the compiler did not catch. Initialize both filter_hash and notrace_hash to EMPTY_HASH, as that is what they should be if they are never used. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250417104017.3aea66c2@gandalf.local.home Reported-by: Venkat Rao Bagalkote Tested-by: Venkat Rao Bagalkote Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Closes: https://lore.kernel.org/all/1db64a42-626d-4b3a-be08-c65e47333ce2@linux.ibm.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a8a02868b435..43394445390c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3490,8 +3490,8 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; int ret; @@ -3625,8 +3625,8 @@ static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash * */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) -- cgit v1.2.3 From 08275e59a75047ba8fc0b9853bfdfc88a124763d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 11:09:33 -0400 Subject: ftrace: Reinitialize hash to EMPTY_HASH after freeing There are several locations that free an ftrace hash pointer that may be referenced again. Reset them to EMPTY_HASH so that a use-after-free bug doesn't happen.
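This is the usual defensive idiom for a pointer that can still be observed after its hash is freed: immediately repoint it at a known-safe sentinel. EMPTY_HASH works as the sentinel because it is a shared, statically allocated hash that is never freed (and free_ftrace_hash() treats it as a no-op, so a later redundant free is harmless as well). In its minimal form:

free_ftrace_hash(ops->func_hash->filter_hash);
ops->func_hash->filter_hash = EMPTY_HASH;	/* sentinel, never freed */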
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250417110933.20ab718b@gandalf.local.home Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 43394445390c..d0e4a902bb40 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1297,6 +1297,8 @@ void ftrace_free_filter(struct ftrace_ops *ops) return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); @@ -3443,6 +3445,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** size_bits); if (ret < 0) { free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; return ret; } } @@ -3472,6 +3475,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** subops_hash->notrace_hash); if (ret < 0) { free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; return ret; } } -- cgit v1.2.3 From c45c585dde535e5ae2c363594bde3e05ce94a296 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 13:59:39 -0400 Subject: ftrace: Free ftrace hashes after they are replaced in the subops code The subops processing creates new hashes when adding and removing subops. In some places the old hashes that were replaced were not freed, and this caused memory leaks. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250417135939.245b128d@gandalf.local.home Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d0e4a902bb40..41dcfcf8b40a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3609,6 +3609,9 @@ static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash * } } + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + temp_hash.filter_hash = *filter_hash; temp_hash.notrace_hash = *notrace_hash; } @@ -3703,8 +3706,11 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, } ret = rebuild_hashes(&filter_hash, &notrace_hash, ops); - if (!ret) + if (!ret) { ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + } if (ret) { /* Put back the original hash */ -- cgit v1.2.3 From 92f1d3b40179b15630d72e2c6e4e25a899b67ba9 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 13 Apr 2025 09:44:44 +0800 Subject: ftrace: fix incorrect hash size in register_ftrace_direct() The maximum of the ftrace hash bits is made fls(32) in register_ftrace_direct(), which seems illogical. So, we fix it by making the max hash bits FTRACE_HASH_MAX_BITS instead.
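fls() returns the 1-based index of the most significant set bit (fls(32) == 6, fls(1000) == 10), so it converts an entry count into the number of hash bits needed to cover it. The old code clamped the entry count to 32 before converting, capping the hash at fls(32) = 6 bits (64 buckets) no matter how many direct functions existed; the clamp belongs on the converted bit count. A worked comparison (FTRACE_HASH_MAX_BITS is 12 in current kernels):

size = hash->count + direct_functions->count;	/* e.g. 1000 entries */

/* before: clamp the count, then convert: at most fls(32) == 6 bits */
if (size > 32)
	size = 32;
bits = fls(size);

/* after: convert, then clamp the bit count: fls(1000) == 10 */
bits = fls(size);
if (bits > FTRACE_HASH_MAX_BITS)
	bits = FTRACE_HASH_MAX_BITS;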
Link: https://lore.kernel.org/20250413014444.36724-1-dongml2@chinatelecom.cn Fixes: d05cb470663a ("ftrace: Fix modification of direct_function hash while in use") Signed-off-by: Menglong Dong Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 41dcfcf8b40a..61130bb34d6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5964,9 +5964,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; -- cgit v1.2.3 From 3b4e87e6a593d571183c414d81758624da01f2b9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Sun, 13 Apr 2025 00:10:43 +0200 Subject: ftrace: Fix type of ftrace_graph_ent_entry.depth ftrace_graph_ent.depth is int, but ftrace_graph_ent_entry.depth is unsigned long. This confuses trace-cmd on 64-bit big-endian systems and makes it print a huge amount of spaces. Fix this by using unsigned int, which has a matching size, instead. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Sven Schnelle Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250412221847.17310-2-iii@linux.ibm.com Fixes: ff5c9c576e75 ("ftrace: Add support for function argument to graph tracer") Signed-off-by: Ilya Leoshkevich Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_entries.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee40d4e6ad1c..4ef4df6623a8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned long, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR -- cgit v1.2.3 From a8c5b0ed89a3f2c81c6ae0b041394e6eea0e7024 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 18:30:03 -0400 Subject: tracing: Fix filter string testing The filter string testing uses strncpy_from_kernel/user_nofault() to retrieve the string to test the filter against. The if() statement was incorrect as it considered 0 as a fault, when it is only negative that it faulted. Running the following commands: # cd /sys/kernel/tracing # echo "filename.ustring ~ \"/proc*\"" > events/syscalls/sys_enter_openat/filter # echo 1 > events/syscalls/sys_enter_openat/enable # ls /proc/$$/maps # cat trace Would produce nothing, but with the fix it will produce something like: ls-1192 [007] ..... 
8169.828333: sys_openat(dfd: ffffffffffffff9c, filename: 7efc18359904, flags: 80000, mode: 0) Link: https://lore.kernel.org/all/CAEf4BzbVPQ=BjWztmEwBPRKHUwNfKBkS3kce-Rzka6zvbQeVpg@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250417183003.505835fb@gandalf.local.home Fixes: 77360f9bbc7e5 ("tracing: Add test for user space strings when filtering on string pointers") Reported-by: Andrii Nakryiko Reported-by: Mykyta Yatsenko Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0993dfc1c5c1..2048560264bb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; -- cgit v1.2.3 From f5178c41bb43444a6008150fe6094497135d07cb Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Tue, 22 Apr 2025 20:30:25 +0900 Subject: tracing: Fix oob write in trace_seq_to_buffer() syzbot reported this bug: ================================================================== BUG: KASAN: slab-out-of-bounds in trace_seq_to_buffer kernel/trace/trace.c:1830 [inline] BUG: KASAN: slab-out-of-bounds in tracing_splice_read_pipe+0x6be/0xdd0 kernel/trace/trace.c:6822 Write of size 4507 at addr ffff888032b6b000 by task syz.2.320/7260 CPU: 1 UID: 0 PID: 7260 Comm: syz.2.320 Not tainted 6.15.0-rc1-syzkaller-00301-g3bde70a2c827 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xc3/0x670 mm/kasan/report.c:521 kasan_report+0xe0/0x110 mm/kasan/report.c:634 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0xef/0x1a0 mm/kasan/generic.c:189 __asan_memcpy+0x3c/0x60 mm/kasan/shadow.c:106 trace_seq_to_buffer kernel/trace/trace.c:1830 [inline] tracing_splice_read_pipe+0x6be/0xdd0 kernel/trace/trace.c:6822 .... ================================================================== It has been reported that trace_seq_to_buffer() tries to copy more data than PAGE_SIZE to buf. Therefore, to prevent this, we should use the smaller of trace_seq_used(&iter->seq) and PAGE_SIZE as an argument. 
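The overflow is possible because trace_seq's backing buffer is larger than a single page in current kernels (TRACE_SEQ_BUFFER_SIZE is just under two pages), while the splice destination here is exactly one page; hence the report of a 4507-byte write into a 4096-byte page. The fix below is the standard bounded-copy idiom, restated here in isolation:

/* copy at most one page, and size the splice segment from what was
 * actually copied rather than from the source length */
len = min((size_t)trace_seq_used(&iter->seq), PAGE_SIZE);
ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), len);
spd.partial[i].len = ret;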
Link: https://lore.kernel.org/20250422113026.13308-1-aha310510@gmail.com Reported-by: syzbot+c8cd2d2c412b868263fb@syzkaller.appspotmail.com Fixes: 3c56819b14b0 ("tracing: splice support for tracing_pipe") Suggested-by: Steven Rostedt Signed-off-by: Jeongjun Park Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ddf6b17215c..6d52dc108f00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6821,13 +6821,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } -- cgit v1.2.3 From 3c1d9cfa8458a4d6b6cd9bc3ca7bb1591130a31c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 22 Apr 2025 23:13:35 +0100 Subject: ftrace: Fix NULL memory allocation check The check for a failed memory allocation is incorrectly checking the wrong level of pointer indirection by checking !filter_hash rather than !*filter_hash. Fix this. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250422221335.89896-1-colin.i.king@gmail.com Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 61130bb34d6c..6981830c3128 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3436,7 +3436,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** /* Copy the subops hash */ *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); - if (!filter_hash) + if (!*filter_hash) return -ENOMEM; /* Remove any notrace functions from the copy */ remove_hash(*filter_hash, subops_hash->notrace_hash); -- cgit v1.2.3 From 1be8e54a1e0f0a4bf70e3d65f94ca1738ee4f1f3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 May 2025 15:19:09 -0400 Subject: tracing: Fix trace_adjust_address() when there are no modules in the scratch area The function trace_adjust_address() is used to map the addresses of modules that are stored in the persistent memory and are also loaded in the current boot, returning the module's current address. If there is only one module entry, it simply uses that; otherwise, it performs a bsearch of the entry array to find the module to offset with. The issue is when there are no modules in the array: the code does not account for that case and ends up referencing the first element of the array, which does not exist, causing a crash. If nr_entries is zero, exit out early as if this was a core kernel address.
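The bug is the classic empty-array edge case: code that inspects entries[0] or binary-searches a sorted array must check the element count first. Schematically, using the field names from the patch below:

/* no module entries recorded: treat the address as core kernel text */
if (!module_delta || !tscratch->nr_entries ||
    tscratch->entries[0].mod_addr > addr)
	return addr + tr->text_delta;

/* safe: at least one sorted entry exists for the bsearch */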
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250501151909.65910359@gandalf.local.home Fixes: 35a380ddbc653 ("tracing: Show last module text symbols in the stacktrace") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6d52dc108f00..5b8db27fb6ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6043,8 +6043,10 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) tscratch = tr->scratch; /* if there is no tscrach, module_delta must be NULL. */ module_delta = READ_ONCE(tr->module_delta); - if (!module_delta || tscratch->entries[0].mod_addr > addr) + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { return addr + tr->text_delta; + } /* Note that entries must be sorted. */ nr_entries = tscratch->nr_entries; -- cgit v1.2.3 From 0a8f11f8569e7ed16cbcedeb28c4350f6378fea6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 May 2025 22:41:28 -0400 Subject: tracing: Do not take trace_event_sem in print_event_fields() On some paths in print_event_fields() it takes the trace_event_sem for read, even though it should always be held when the function is called. Remove the taking of that mutex and add a lockdep_assert_held_read() to make sure the trace_event_sem is held when print_event_fields() is called. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250501224128.0b1f0571@batman.local.home Fixes: 80a76994b2d88 ("tracing: Add "fields" option to show raw trace event fields") Reported-by: syzbot+441582c1592938fccf09@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6813ff5e.050a0220.14dd7d.001b.GAE@google.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index fee40ffbd490..b9ab06c99543 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1042,11 +1042,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -1056,7 +1057,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; -- cgit v1.2.3 From 9dda18a32b4a6693fccd3f7c0738af646147b1cf Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 10 Apr 2025 05:22:21 -0700 Subject: tracing: fprobe: Fix RCU warning message in list traversal When CONFIG_PROVE_RCU_LIST is enabled, fprobe triggers the following warning: WARNING: suspicious RCU usage kernel/trace/fprobe.c:457 RCU-list traversed in non-reader section!! 
other info that might help us debug this: #1: ffffffff863c4e08 (fprobe_mutex){+.+.}-{4:4}, at: fprobe_module_callback+0x7b/0x8c0 Call Trace: fprobe_module_callback notifier_call_chain blocking_notifier_call_chain This warning occurs because fprobe_remove_node_in_module() traverses an RCU list using RCU primitives without holding an RCU read lock. However, the function is only called from fprobe_module_callback(), which holds the fprobe_mutex lock that provides sufficient protection for safely traversing the list. Fix the warning by declaring the locking design to the CONFIG_PROVE_RCU_LIST mechanism: add the lockdep_is_held() argument to hlist_for_each_entry_rcu() to inform the RCU checker that fprobe_mutex provides the required protection. Link: https://lore.kernel.org/all/20250410-fprobe-v1-1-068ef5f41436@debian.org/ Fixes: a3dc2983ca7b90 ("tracing: fprobe: Cleanup fprobe hash when module unloading") Signed-off-by: Breno Leitao Tested-by: Antonio Quartulli Tested-by: Matthieu Baerts (NGI0) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95c6e3473a76..ba7ff14f5339 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -454,7 +454,8 @@ static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head * struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist) { + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { if (!within_module(node->addr, mod)) continue; if (delete_fprobe_node(node)) -- cgit v1.2.3 From e41b5af4519f90f9a751805ede2102ae36caf5d0 Mon Sep 17 00:00:00 2001 From: Paul Cacheux Date: Sun, 4 May 2025 20:27:52 +0200 Subject: tracing: add missing trace_probe_log_clear for eprobes Make sure trace_probe_log_clear is called in the tracing eprobe code path, matching the trace_probe_log_init call. Link: https://lore.kernel.org/all/20250504-fix-trace-probe-log-race-v3-1-9e99fec7eddc@gmail.com/ Signed-off-by: Paul Cacheux Acked-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08355c3ef32..916555f0de81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } -- cgit v1.2.3 From fd837de3c9cb1a162c69bc1fb1f438467fe7f2f5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 10 May 2025 12:44:41 +0900 Subject: tracing: probes: Fix a possible race in trace_probe_log APIs Since the shared trace_probe_log variable can be accessed and modified via the probe event create operations of kprobe_events, uprobe_events, and dynamic_events, it should be protected. In dynamic_events, all operations are serialized by `dyn_event_ops_mutex`, but the kprobe_events and uprobe_events interfaces are not serialized. To solve this issue, introduce dyn_event_create(), which runs the create() operation under the mutex, for kprobe_events and uprobe_events. This also uses lockdep to check that the mutex is held when using the trace_probe_log* APIs.
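lockdep_assert_held() turns that locking contract into a runtime check on lockdep-enabled kernels: any path that reaches the shared state without the mutex splats immediately instead of racing silently. The pattern in isolation (my_* names are placeholders):

#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);
static int my_shared_state;	/* protected by my_mutex */

static void touch_state(void)
{
	lockdep_assert_held(&my_mutex);	/* document and enforce the rule */
	my_shared_state++;
}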
From e41b5af4519f90f9a751805ede2102ae36caf5d0 Mon Sep 17 00:00:00 2001
From: Paul Cacheux
Date: Sun, 4 May 2025 20:27:52 +0200
Subject: tracing: add missing trace_probe_log_clear for eprobes

Make sure trace_probe_log_clear is called in the tracing eprobe code
path, matching the trace_probe_log_init call.

Link: https://lore.kernel.org/all/20250504-fix-trace-probe-log-race-v3-1-9e99fec7eddc@gmail.com/
Signed-off-by: Paul Cacheux
Acked-by: Steven Rostedt (Google)
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_eprobe.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index c08355c3ef32..916555f0de81 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 			goto error;
 		}
 	}
+	trace_probe_log_clear();
 	return ret;
+
 parse_error:
 	ret = -EINVAL;
 error:
+	trace_probe_log_clear();
 	trace_event_probe_cleanup(ep);
 	return ret;
 }
-- cgit v1.2.3
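The rule this fix enforces, every trace_probe_log_init() matched by a
trace_probe_log_clear() on every return path, is easiest to keep with a
single exit label. A hypothetical sketch of that shape (create_my_event
and its argument check are made up; the trace_probe_log_* helpers are
the ones touched by these patches):

    static int create_my_event(int argc, const char *argv[])
    {
    	int ret = 0;

    	trace_probe_log_init("my_event", argc, argv);

    	if (argc < 2) {
    		/* Point the error log at the offending token first. */
    		trace_probe_log_set_index(argc ? argc - 1 : 0);
    		ret = -EINVAL;
    		goto out;
    	}

    	/* ... parse the remaining arguments ... */

    out:
    	trace_probe_log_clear();	/* matches the init on every path */
    	return ret;
    }

Funneling all exits through one label means a later "goto out" added in
review cannot leak a stale log the way a bare "return ret" could.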
From fd837de3c9cb1a162c69bc1fb1f438467fe7f2f5 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Sat, 10 May 2025 12:44:41 +0900
Subject: tracing: probes: Fix a possible race in trace_probe_log APIs

Since the shared trace_probe_log variable can be accessed and modified
via the probe event create operations of kprobe_events, uprobe_events,
and dynamic_events, it should be protected. In dynamic_events, all
operations are serialized by `dyn_event_ops_mutex`, but the
kprobe_events and uprobe_events interfaces are not serialized.

To solve this issue, introduce dyn_event_create(), which runs the
create() operation under the mutex, and use it for kprobe_events and
uprobe_events. Also use lockdep to check that the mutex is held when
using the trace_probe_log* APIs.

Link: https://lore.kernel.org/all/174684868120.551552.3068655787654268804.stgit@devnote2/
Reported-by: Paul Cacheux
Closes: https://lore.kernel.org/all/20250510074456.805a16872b591e2971a4d221@kernel.org/
Fixes: ab105a4fb894 ("tracing: Use tracing error_log with probe events")
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_dynevent.c | 16 +++++++++++++++-
 kernel/trace/trace_dynevent.h |  1 +
 kernel/trace/trace_kprobe.c   |  2 +-
 kernel/trace/trace_probe.c    |  9 +++++++++
 kernel/trace/trace_uprobe.c   |  2 +-
 5 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index a322e4f249a5..5d64a18cacac 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@
 #include "trace_output.h"	/* for trace_event_sem */
 #include "trace_dynevent.h"

-static DEFINE_MUTEX(dyn_event_ops_mutex);
+DEFINE_MUTEX(dyn_event_ops_mutex);
 static LIST_HEAD(dyn_event_ops_list);

 bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
 	return ret;
 }

+/*
+ * Locked version of event creation. The event creation must be protected by
+ * dyn_event_ops_mutex because of protecting trace_probe_log.
+ */
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type)
+{
+	int ret;
+
+	mutex_lock(&dyn_event_ops_mutex);
+	ret = type->create(raw_command);
+	mutex_unlock(&dyn_event_ops_mutex);
+	return ret;
+}
+
 static int create_dyn_event(const char *raw_command)
 {
 	struct dyn_event_operations *ops;
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 936477a111d3..beee3f8d7544 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
 void dyn_event_seq_stop(struct seq_file *m, void *v);
 int dyn_events_release_all(struct dyn_event_operations *type);
 int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type);

 /*
  * for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2703b96d8990..3e5c47b6d7b2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command)
 	if (raw_command[0] == '-')
 		return dyn_event_release(raw_command, &trace_kprobe_ops);

-	ret = trace_kprobe_create(raw_command);
+	ret = dyn_event_create(raw_command, &trace_kprobe_ops);
 	return ret == -ECANCELED ? -EINVAL : ret;
 }
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2eeecb6c95ee..424751cdf31f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -154,9 +154,12 @@ fail:
 }

 static struct trace_probe_log trace_probe_log;
+extern struct mutex dyn_event_ops_mutex;

 void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
 {
+	lockdep_assert_held(&dyn_event_ops_mutex);
+
 	trace_probe_log.subsystem = subsystem;
 	trace_probe_log.argc = argc;
 	trace_probe_log.argv = argv;
@@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv)

 void trace_probe_log_clear(void)
 {
+	lockdep_assert_held(&dyn_event_ops_mutex);
+
 	memset(&trace_probe_log, 0, sizeof(trace_probe_log));
 }

 void trace_probe_log_set_index(int index)
 {
+	lockdep_assert_held(&dyn_event_ops_mutex);
+
 	trace_probe_log.index = index;
 }

@@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type)
 	char *command, *p;
 	int i, len = 0, pos = 0;

+	lockdep_assert_held(&dyn_event_ops_mutex);
+
 	if (!trace_probe_log.argv)
 		return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3386439ec9f6..35cf76c75dd7 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -741,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command)
 	if (raw_command[0] == '-')
 		return dyn_event_release(raw_command, &trace_uprobe_ops);

-	ret = trace_uprobe_create(raw_command);
+	ret = dyn_event_create(raw_command, &trace_uprobe_ops);
 	return ret == -ECANCELED ? -EINVAL : ret;
 }
-- cgit v1.2.3
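The shape of this fix, funneling several user-facing entry points
through one mutex so a shared scratch object stays consistent, in a
minimal hypothetical sketch (create_mutex, parse_log and locked_create
are made up; parse_log stands in for trace_probe_log):

    #include <linux/mutex.h>

    static DEFINE_MUTEX(create_mutex);

    /* Shared scratch state, analogous to trace_probe_log. */
    static struct {
    	const char *subsystem;
    	int argc;
    	const char **argv;
    } parse_log;

    static int locked_create(int (*create)(const char *), const char *cmd)
    {
    	int ret;

    	mutex_lock(&create_mutex);
    	ret = create(cmd);	/* create() may freely use parse_log */
    	mutex_unlock(&create_mutex);
    	return ret;
    }

Once every interface that can reach the shared log goes through
locked_create(), a lockdep_assert_held(&create_mutex) inside the log
helpers turns any unserialized caller into a loud lockdep splat instead
of silent log corruption, which is exactly what the trace_probe.c hunks
above do.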
From e333332657f615ac2b55aa35565c4a882018bbe9 Mon Sep 17 00:00:00 2001
From: pengdonglin
Date: Mon, 12 May 2025 17:42:45 +0800
Subject: ftrace: Fix preemption accounting for stacktrace trigger command

When using the stacktrace trigger command to trace syscalls, the
preemption count was consistently reported as 1, even though the system
call event itself showed 0 ("."). For example:

    root@ubuntu22-vm:/sys/kernel/tracing/events/syscalls/sys_enter_read
    $ echo stacktrace > trigger
    $ echo 1 > enable

    sshd-416 [002] ..... 232.864910: sys_read(fd: a, buf: 556b1f3221d0, count: 8000)
    sshd-416 [002] ...1. 232.864913:
    => ftrace_syscall_enter
    => syscall_trace_enter
    => do_syscall_64
    => entry_SYSCALL_64_after_hwframe

The root cause is that the trace framework disables preemption in
__DO_TRACE before invoking the trigger callback. Use
tracing_gen_ctx_dec(), which accounts for the preemption count increase
that __DO_TRACE adds when calling the callback. The result is the
accurate reporting of:

    sshd-410 [004] ..... 210.117660: sys_read(fd: 4, buf: 559b725ba130, count: 40000)
    sshd-410 [004] ..... 210.117662:
    => ftrace_syscall_enter
    => syscall_trace_enter
    => do_syscall_64
    => entry_SYSCALL_64_after_hwframe

Cc: stable@vger.kernel.org
Fixes: ce33c845b030c ("tracing: Dump stacktrace trigger to the corresponding instance")
Link: https://lore.kernel.org/20250512094246.1167956-1-dolinux.peng@gmail.com
Signed-off-by: pengdonglin
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events_trigger.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b66b6d235d91..6e87ae2a1a66 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1560,7 +1560,7 @@ stacktrace_trigger(struct event_trigger_data *data,
 	struct trace_event_file *file = data->private_data;

 	if (file)
-		__trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+		__trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
 	else
 		trace_dump_stack(STACK_SKIP);
 }
-- cgit v1.2.3

From 11aff32439df6ca5b3b891b43032faf88f4a6a29 Mon Sep 17 00:00:00 2001
From: pengdonglin
Date: Mon, 12 May 2025 17:42:46 +0800
Subject: ftrace: Fix preemption accounting for stacktrace filter command

The preemption count of the stacktrace filter command to trace
ksys_read is consistently incorrect:

    $ echo ksys_read:stacktrace > set_ftrace_filter

    <...>-453 [004] ...1. 38.308956:
    => ksys_read
    => do_syscall_64
    => entry_SYSCALL_64_after_hwframe

The root cause is that the trace framework disables preemption when
invoking the filter command callback in function_trace_probe_call:

    preempt_disable_notrace();
    probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data);
    preempt_enable_notrace();

Use tracing_gen_ctx_dec() to account for the preempt_disable_notrace(),
which will output the correct preemption count:

    $ echo ksys_read:stacktrace > set_ftrace_filter

    <...>-410 [006] ..... 31.420396:
    => ksys_read
    => do_syscall_64
    => entry_SYSCALL_64_after_hwframe

Cc: stable@vger.kernel.org
Fixes: 36590c50b2d07 ("tracing: Merge irqflags + preempt counter.")
Link: https://lore.kernel.org/20250512094246.1167956-2-dolinux.peng@gmail.com
Signed-off-by: pengdonglin
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_functions.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 98ccf3f00c51..4e37a0f6aaa3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -633,11 +633,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,

 static __always_inline void trace_stack(struct trace_array *tr)
 {
-	unsigned int trace_ctx;
-
-	trace_ctx = tracing_gen_ctx();
-
-	__trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
+	__trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP);
 }

 static void
-- cgit v1.2.3
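Both fixes rely on the same convention: when a framework disables
preemption just before invoking a callback, the callback records its
context with tracing_gen_ctx_dec() so the event reflects the count at
the traced call site rather than the framework's own disable. A minimal
sketch (my_probe_func and the skip count are hypothetical;
__trace_stack() and tracing_gen_ctx_dec() are the in-tree helpers used
by the patches above):

    /* Invoked by a framework that already did preempt_disable_notrace(). */
    static void my_probe_func(struct trace_array *tr)
    {
    	/*
    	 * The raw preempt count is now one higher than what the traced
    	 * code saw, so tracing_gen_ctx() would encode "...1." here.
    	 * tracing_gen_ctx_dec() subtracts that extra disable before
    	 * encoding the context, yielding ".....".
    	 */
    	__trace_stack(tr, tracing_gen_ctx_dec(), 4 /* frames to skip */);
    }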
From 1d6c39c89f617c9fec6bbae166e25b16a014f7c8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 13 May 2025 11:50:32 -0400
Subject: ring-buffer: Fix persistent buffer when commit page is the reader page

The ring buffer is made up of sub buffers (sometimes called pages as
they are by default PAGE_SIZE). It has the following "pages":

  "tail page"   - the page that the next write will write to
  "head page"   - the page that the reader will swap the reader page with
  "reader page" - the page that belongs to the reader, which it swaps with
                  the head page so that the reader does not race with the
                  writer

The writer may end up on the "reader page" if the ring buffer hasn't
written more than one page, where the "tail page" and the "head page"
are the same.

The persistent ring buffer has meta data that records where these pages
are, so that on reboot it can re-create the pointers in the cpu_buffer
descriptor. But when the commit page is on the reader page, the logic is
incorrect.

The check to see if the commit page is on the reader page tested whether
the head page was the reader page, which can never happen, as the head
page is always in the ring buffer. The correct check is to test whether
the commit page is on the reader page. If that's the case, it can exit
out early, as the commit page is only on the reader page when there's
only one page of data in the buffer. There's no reason to iterate the
ring buffer pages to find the "commit page", as it is already found.

To trigger this bug:

  # echo 1 > /sys/kernel/tracing/instances/boot_mapped/events/syscalls/sys_enter_fchownat/enable
  # touch /tmp/x
  # chown sshd /tmp/x
  # reboot

On boot up, the dmesg will have:

  Ring buffer meta [0] is from previous boot!
  Ring buffer meta [1] is from previous boot!
  Ring buffer meta [2] is from previous boot!
  Ring buffer meta [3] is from previous boot!
  Ring buffer meta [4] commit page not found
  Ring buffer meta [5] is from previous boot!
  Ring buffer meta [6] is from previous boot!
  Ring buffer meta [7] is from previous boot!

Here the buffer on CPU 4 hit a "commit page not found" error, so that
buffer is cleared and reset, causing its output to be empty and the data
to be lost.

When it works correctly, it has:

  # cat /sys/kernel/tracing/instances/boot_mapped/trace_pipe
  <...>-1137 [004] ..... 998.205323: sys_enter_fchownat: __syscall_nr=0x104 (260) dfd=0xffffff9c (4294967196) filename=(0xffffc90000a0002c) user=0x3e8 (1000) group=0xffffffff (4294967295) flag=0x0 (0)

Cc: stable@vger.kernel.org
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20250513115032.3e0b97f7@gandalf.local.home
Fixes: 5f3b6e839f3ce ("ring-buffer: Validate boot range memory events")
Reported-by: Tasos Sahanidis
Tested-by: Tasos Sahanidis
Reviewed-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c0f877d39a24..3f9bf562beea 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1887,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)

 	head_page = cpu_buffer->head_page;

-	/* If both the head and commit are on the reader_page then we are done. */
-	if (head_page == cpu_buffer->reader_page &&
-	    head_page == cpu_buffer->commit_page)
+	/* If the commit_buffer is the reader page, update the commit page */
+	if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
+		cpu_buffer->commit_page = cpu_buffer->reader_page;
+		/* Nothing more to do, the only page is the reader page */
 		goto done;
+	}

 	/* Iterate until finding the commit page */
 	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
-- cgit v1.2.3
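The corrected search can be paraphrased outside the kernel. A
hypothetical, self-contained sketch of the idea (the types and names are
made up; it only mirrors the structure of the validation loop in
rb_meta_validate_events()):

    #include <stddef.h>

    struct subbuf {
    	struct subbuf *next;	/* circular list of ring pages */
    	void *page;
    };

    /*
     * Find which sub-buffer the persisted commit address refers to.
     * The reader page must be checked first: it is not linked into the
     * ring, so the loop below can never reach it. Matching it means
     * there is only one page of data and the search is already over.
     */
    static struct subbuf *find_commit_page(unsigned long commit_addr,
    				       struct subbuf *reader,
    				       struct subbuf *head, int nr_subbufs)
    {
    	struct subbuf *sb = head;
    	int i;

    	if (commit_addr == (unsigned long)reader->page)
    		return reader;

    	for (i = 0; i < nr_subbufs; i++, sb = sb->next) {
    		if (commit_addr == (unsigned long)sb->page)
    			return sb;
    	}
    	return NULL;	/* stale or corrupt meta data: caller resets the buffer */
    }

The original bug corresponds to comparing head pointers before the loop
instead of the commit address against the reader page, so the early
return never fired and the loop reported "commit page not found".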