From 17a1a107d0e96c1b7eef875de46f1d953c557f88 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 23 Sep 2025 09:04:58 -0400
Subject: tracing: Replace syscall RCU pointer assignment with READ/WRITE_ONCE()

The syscall events are pseudo events that hook into the raw syscalls.
The ftrace_syscall_enter/exit() callbacks are called by the raw syscall
enter/exit tracepoints respectively whenever any of the syscall events
are enabled.

The trace_array has an array of syscall "files" that correspond to the
system calls, indexed by their __NR_* syscall number. The array is read,
and if there is a pointer to a trace_event_file the syscall event is
considered enabled; if the pointer is NULL, that syscall event is
considered disabled.

Currently the code uses rcu_dereference_sched() to read this pointer and
rcu_assign_pointer() or RCU_INIT_POINTER() to write to it. This is
unnecessary, as the file pointer will not go away outside the
synchronization of the tracepoint logic itself, and this code adds no
RCU synchronization of its own. Replace these helpers with a simple
READ_ONCE() and WRITE_ONCE(), which is all that is needed.

This also allows the code to not depend on preemption being disabled, as
system call tracepoints are now allowed to fault.

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Link: https://lore.kernel.org/20250923130713.594320290@kernel.org
Reviewed-by: Paul E. McKenney
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.h          |  4 ++--
 kernel/trace/trace_syscalls.c | 14 ++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5f4bed5842f9..85eabb454bee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -380,8 +380,8 @@ struct trace_array {
 #ifdef CONFIG_FTRACE_SYSCALLS
 	int			sys_refcount_enter;
 	int			sys_refcount_exit;
-	struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
-	struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
+	struct trace_event_file *enter_syscall_files[NR_syscalls];
+	struct trace_event_file *exit_syscall_files[NR_syscalls];
 #endif
 	int			stop_count;
 	int			clock_id;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 46aab0ab9350..3a0b65f89130 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -310,8 +310,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;
 
-	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
-	trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
 	if (!trace_file)
 		return;
 
@@ -356,8 +355,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;
 
-	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
-	trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
 	if (!trace_file)
 		return;
 
@@ -393,7 +391,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 	if (!tr->sys_refcount_enter)
 		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
 	if (!ret) {
-		rcu_assign_pointer(tr->enter_syscall_files[num], file);
+		WRITE_ONCE(tr->enter_syscall_files[num], file);
 		tr->sys_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -411,7 +409,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file,
 		return;
 	mutex_lock(&syscall_trace_lock);
 	tr->sys_refcount_enter--;
-	RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
+	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
 	if (!tr->sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
 	mutex_unlock(&syscall_trace_lock);
@@ -431,7 +429,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file,
 	if (!tr->sys_refcount_exit)
 		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
 	if (!ret) {
-		rcu_assign_pointer(tr->exit_syscall_files[num], file);
+		WRITE_ONCE(tr->exit_syscall_files[num], file);
 		tr->sys_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -449,7 +447,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
 		return;
 	mutex_lock(&syscall_trace_lock);
 	tr->sys_refcount_exit--;
-	RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
+	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
 	if (!tr->sys_refcount_exit)
 		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
 	mutex_unlock(&syscall_trace_lock);
--
cgit v1.2.3
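As background on the pattern the patch above relies on, here is a minimal userspace sketch, with hypothetical types and READ_ONCE()/WRITE_ONCE() approximated by volatile accesses; it is not the kernel implementation. The writer publishes or clears a slot pointer under a mutex, the reader does a single plain load and treats NULL as "disabled". No RCU grace period is needed because, as in the kernel case, the pointed-to object's lifetime is guaranteed by other synchronization (there, the tracepoint unregistration).

#include <pthread.h>
#include <stddef.h>

/* Userspace stand-ins for the kernel macros. */
#define READ_ONCE(x)	 (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

#define NR_SLOTS 64

struct event_file { long hits; };

static struct event_file *slots[NR_SLOTS];
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

/* Writer side: register/unregister an event, serialized by a mutex. */
static void slot_set(int nr, struct event_file *file)
{
	pthread_mutex_lock(&slot_lock);
	WRITE_ONCE(slots[nr], file);	/* NULL clears the slot */
	pthread_mutex_unlock(&slot_lock);
}

/* Reader side: the "tracepoint callback"; a NULL slot means disabled. */
static void slot_hit(int nr)
{
	struct event_file *file;

	if (nr < 0 || nr >= NR_SLOTS)
		return;
	file = READ_ONCE(slots[nr]);
	if (!file)
		return;
	file->hits++;	/* "record the event" */
}

int main(void)
{
	static struct event_file file;

	slot_set(1, &file);	/* enable */
	slot_hit(1);		/* records */
	slot_set(1, NULL);	/* disable */
	slot_hit(1);		/* now a no-op */
	return 0;
}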
From 3add2d34bdfb1caab1d3f28ba0160f52dcff9353 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 23 Sep 2025 09:04:59 -0400
Subject: tracing: Have syscall trace events show "0x" for values greater than 10

Currently the syscall trace events show each value as hexadecimal, but
without a "0x" prefix the output can be confusing:

  sys_write(fd: 4, buf: 55c4a1fa9270, count: 44)

It looks like the above write wrote 44 bytes, when in reality it wrote
68 bytes. Add a "0x" prefix for all values greater than or equal to 10
to remove the ambiguity. For values less than 10, leave off the "0x", as
it only adds noise to the output.

Also change the iterator to print the ", " delimiter at the start of
each argument when "i" is nonzero, instead of appending it in the
trace_seq_printf() at the end.

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Link: https://lore.kernel.org/20250923130713.764558957@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_syscalls.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 3a0b65f89130..0f932b22f9ec 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -153,14 +153,20 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		if (trace_seq_has_overflowed(s))
 			goto end;
 
+		if (i)
+			trace_seq_puts(s, ", ");
+
 		/* parameter types */
 		if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
 			trace_seq_printf(s, "%s ", entry->types[i]);
 
 		/* parameter values */
-		trace_seq_printf(s, "%s: %lx%s", entry->args[i],
-				 trace->args[i],
-				 i == entry->nb_args - 1 ? "" : ", ");
+		if (trace->args[i] < 10)
+			trace_seq_printf(s, "%s: %lu", entry->args[i],
+					 trace->args[i]);
+		else
+			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
+					 trace->args[i]);
 	}
 
 	trace_seq_putc(s, ')');
--
cgit v1.2.3
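The new formatting rule is easy to model outside the kernel. The sketch below uses plain printf() instead of trace_seq and made-up argument arrays; it shows the two branches and the leading ", " delimiter. With it, the sys_write example above prints count as 0x44, so a 68-byte write can no longer be mistaken for a 44-byte one.

/* Userspace sketch of the new argument formatting; not the kernel trace_seq code. */
#include <stdio.h>

static void print_syscall_args(const char *name, const char **argnames,
			       const unsigned long *args, int nb_args)
{
	printf("%s(", name);
	for (int i = 0; i < nb_args; i++) {
		if (i)
			printf(", ");	/* delimiter now leads every argument after the first */
		if (args[i] < 10)
			printf("%s: %lu", argnames[i], args[i]);	/* small values: decimal, no 0x */
		else
			printf("%s: 0x%lx", argnames[i], args[i]);	/* larger values: prefixed hex */
	}
	printf(")\n");
}

int main(void)
{
	const char *names[] = { "fd", "buf", "count" };
	unsigned long args[] = { 4, 0x55c4a1fa9270UL, 0x44 };

	print_syscall_args("sys_write", names, args, 3);
	/* prints: sys_write(fd: 4, buf: 0x55c4a1fa9270, count: 0x44) */
	return 0;
}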
From 09da59344a5a2abb5b2f209cf149421d7d105ebc Mon Sep 17 00:00:00 2001
From: Qianfeng Rong
Date: Sun, 17 Aug 2025 16:47:25 +0800
Subject: tracing: Use vmalloc_array() to improve code

Remove the array_size() call and replace vmalloc() with vmalloc_array()
in tracing_map_sort_entries().

vmalloc_array() is better optimized, uses fewer instructions, and
handles the overflow check more concisely[1].

[1]: https://lore.kernel.org/lkml/abc66ec5-85a4-47e1-9759-2f60ab111971@vivo.com/

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20250817084725.59477-1-rongqianfeng@vivo.com
Signed-off-by: Qianfeng Rong
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/tracing_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 1921ade45be3..7f8da4dab69d 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -1076,7 +1076,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
 	struct tracing_map_sort_entry *sort_entry, **entries;
 	int i, n_entries, ret;
 
-	entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
+	entries = vmalloc_array(map->max_elts, sizeof(sort_entry));
 	if (!entries)
 		return -ENOMEM;
 
--
cgit v1.2.3

From 1d67d67a8c88db99ebf5b1323c238929c5fa8483 Mon Sep 17 00:00:00 2001
From: Fushuai Wang
Date: Mon, 11 Aug 2025 14:41:58 +0800
Subject: tracing/osnoise: Use for_each_online_cpu() instead of for_each_cpu()

Replace the open-coded for_each_cpu(cpu, cpu_online_mask) loops with the
more readable and equivalent for_each_online_cpu(cpu) macro.

Link: https://lore.kernel.org/20250811064158.2456-1-wangfushuai@baidu.com
Signed-off-by: Fushuai Wang
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_osnoise.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index fd259da0aa64..4cb464894faf 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -271,7 +271,7 @@ static inline void tlat_var_reset(void)
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
-	for_each_cpu(cpu, cpu_online_mask) {
+	for_each_online_cpu(cpu) {
 		tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
 		if (tlat_var->kthread)
 			hrtimer_cancel(&tlat_var->timer);
@@ -295,7 +295,7 @@ static inline void osn_var_reset(void)
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
-	for_each_cpu(cpu, cpu_online_mask) {
+	for_each_online_cpu(cpu) {
 		osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
 		memset(osn_var, 0, sizeof(*osn_var));
 	}
--
cgit v1.2.3
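For context on what vmalloc_array() provides, here is a userspace analogue of the multiplication overflow guard (illustrative names only; the kernel helper does more than this): the allocation is refused outright when n * size would wrap around.

/* Illustrative stand-in for the overflow check done by array_size()/vmalloc_array(). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *alloc_array(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)	/* n * size would wrap */
		return NULL;
	return malloc(n * size);
}

int main(void)
{
	void *ok = alloc_array(1024, sizeof(long));
	void *bad = alloc_array(SIZE_MAX, sizeof(long));	/* rejected, no wraparound */

	printf("ok=%p bad=%p\n", ok, bad);
	free(ok);
	return 0;
}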
From 8613a55ac57baf40e54633eab00c820515196113 Mon Sep 17 00:00:00 2001
From: Liao Yuanhong
Date: Mon, 25 Aug 2025 20:31:59 +0800
Subject: tracing: Remove redundant 0 value initialization

The saved_cmdlines_buffer struct is already zeroed by memset(), so it is
redundant to also initialize s->cmdline_idx to 0.

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20250825123200.306272-1-liaoyuanhong@vivo.com
Signed-off-by: Liao Yuanhong
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_sched_switch.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb49f7279dc8..518dfc74347a 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	/* Place map_cmdline_to_pid array right after saved_cmdlines */
 	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
 
-	s->cmdline_idx = 0;
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
 	       sizeof(s->map_pid_to_cmdline));
 	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
--
cgit v1.2.3

From 70bd70c303ad4a00b299cb2468bc6475ff90b5b1 Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Fri, 5 Sep 2025 11:10:40 +0200
Subject: tracing: replace use of system_wq with system_percpu_wq

Currently, if a user enqueues a work item using schedule_delayed_work(),
the workqueue used is "system_wq" (a per-CPU wq), while
queue_delayed_work() uses WORK_CPU_UNBOUND (used when a CPU is not
specified). The same applies to schedule_work(), which uses system_wq,
and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack
of consistency cannot be addressed without refactoring the API.

system_wq is a per-CPU workqueue, yet nothing in its name conveys that
CPU affinity constraint, which is very often not required by users. Make
it clear by adding a system_percpu_wq.

queue_work(), queue_delayed_work() and mod_delayed_work() will now use
the new per-CPU wq: if a user still sticks to the old name, a warning
will be printed along with a redirect to the new wq.

This patch adds the new system_percpu_wq, except for the mm, fs and net
subsystems, which are handled in separate patches.

The old wq will be kept for a few release cycles.

Cc: Lai Jiangshan
Cc: Frederic Weisbecker
Cc: Sebastian Andrzej Siewior
Cc: Michal Hocko
Cc: Masami Hiramatsu
Link: https://lore.kernel.org/20250905091040.109772-2-marco.crivellari@suse.com
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index af42aaa3d172..3169182229ad 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t)
 	 * so we use a work queue after call_rcu() to run within.
 	 */
 	INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
-	queue_rcu_work(system_wq, &mm->put_rwork);
+	queue_rcu_work(system_percpu_wq, &mm->put_rwork);
 }
 
 void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
--
cgit v1.2.3
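The workqueue change above is part of an API migration rather than a behavioural fix. A toy userspace model of the idea, with hypothetical types and helpers that are not the kernel API, keeps the old name alive while warning and servicing it from the same backing queue:

/* Toy model only: not the kernel workqueue implementation. */
#include <stdbool.h>
#include <stdio.h>

struct workqueue { const char *name; };

static struct workqueue percpu_wq_impl = { .name = "system_percpu_wq" };

struct wq_handle {
	struct workqueue *wq;
	bool deprecated;
};

static struct wq_handle system_percpu_wq = { &percpu_wq_impl, false };
static struct wq_handle system_wq = { &percpu_wq_impl, true };	/* legacy alias */

static void queue_work(struct wq_handle *h, void (*fn)(void))
{
	if (h->deprecated)
		fprintf(stderr, "warning: system_wq is deprecated, use %s\n",
			h->wq->name);
	fn();	/* toy model: run the work synchronously */
}

static void example_work(void) { puts("work ran"); }

int main(void)
{
	queue_work(&system_wq, example_work);		/* warns, still runs */
	queue_work(&system_percpu_wq, example_work);	/* preferred spelling */
	return 0;
}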
From 4099b98203d6b33d990586542fa5beee408032a3 Mon Sep 17 00:00:00 2001
From: Vladimir Riabchun
Date: Fri, 12 Sep 2025 13:28:55 +0200
Subject: ftrace: Fix softlockup in ftrace_module_enable

A soft lockup was observed when loading the amdgpu module. If a module
has a lot of traceable functions, multiple calls to kallsyms_lookup can
spend too much time in an RCU critical section with preemption disabled,
causing a kernel panic.

This is the same issue that was fixed in commit d0b24b4e91fc ("ftrace:
Prevent RCU stall on PREEMPT_VOLUNTARY kernels") and commit 42ea22e754ba
("ftrace: Add cond_resched() to ftrace_graph_set_hash()"). Fix it the
same way, by adding a cond_resched() in ftrace_module_enable().

Link: https://lore.kernel.org/aMQD9_lxYmphT-up@vova-pc
Signed-off-by: Vladimir Riabchun
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ftrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a69067367c29..42bd2ba68a82 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7535,6 +7535,8 @@ void ftrace_module_enable(struct module *mod)
 		if (!within_module(rec->ip, mod))
 			break;
 
+		cond_resched();
+
 		/* Weak functions should still be ignored */
 		if (!test_for_valid_rec(rec)) {
 			/* Clear all other flags. Should not be enabled anyway */
--
cgit v1.2.3

From 2378a191f440a06e4c60fb8a50f4cb708c10ba40 Mon Sep 17 00:00:00 2001
From: Michal Koutný
Date: Wed, 24 Sep 2025 13:38:09 +0200
Subject: tracing: Ensure optimized hashing works
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If PID_MAX_DEFAULT ever changes, it must remain compatible with the
tracing code's hashing assumptions: trace_save_cmdline() masks the PID
with (PID_MAX_DEFAULT - 1), which only works when PID_MAX_DEFAULT is a
power of two. Enforce that with a BUILD_BUG_ON().

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20250924113810.2433478-1-mkoutny@suse.com
Link: https://lore.kernel.org/r/20240409110126.651e94cb@gandalf.local.home/
Signed-off-by: Michal Koutný
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_sched_switch.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 518dfc74347a..c46d584ded3b 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk)
 	if (!tsk->pid)
 		return 1;
 
+	BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));
+
 	tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
 
 	/*
--
cgit v1.2.3
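The BUILD_BUG_ON() added above guards a classic trick: masking with (PID_MAX_DEFAULT - 1) is only equivalent to a modulo when PID_MAX_DEFAULT is a power of two. A small standalone sketch, where the value mirrors the kernel default of 32768 and _Static_assert stands in for BUILD_BUG_ON():

/* Userspace sketch of the power-of-two masking assumption; not the kernel code. */
#include <stdio.h>

#define PID_MAX_DEFAULT 0x8000	/* mirrors the kernel default of 32768 */

_Static_assert((PID_MAX_DEFAULT & (PID_MAX_DEFAULT - 1)) == 0,
	       "PID_MAX_DEFAULT must be a power of two");

static unsigned int pid_slot(unsigned int pid)
{
	return pid & (PID_MAX_DEFAULT - 1);	/* same as pid % PID_MAX_DEFAULT */
}

int main(void)
{
	printf("%u %u\n", pid_slot(100), pid_slot(32769));	/* prints 100 and 1 */
	return 0;
}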
From 61e19cd2e5c5235326a13a68df1a2f8ec4eeed7b Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Mon, 29 Sep 2025 07:32:38 -0400
Subject: tracing: Fix lock imbalance in s_start() memory allocation failure path

When s_start() fails to allocate memory for set_event_iter, it returns
NULL before acquiring event_mutex. However, the corresponding s_stop()
function always tries to unlock the mutex, causing a lock imbalance
warning:

WARNING: bad unlock balance detected!
6.17.0-rc7-00175-g2b2e0c04f78c #7 Not tainted
-------------------------------------
syz.0.85611/376514 is trying to release lock (event_mutex) at:
[] traverse.part.0.constprop.0+0x2c4/0x650 fs/seq_file.c:131
but there are no more locks to release!

The issue was introduced by commit b355247df104 ("tracing: Cache ':mod:'
events for modules not loaded yet") which added the kzalloc() allocation
before the mutex lock, creating a path where s_start() could return
without locking the mutex while s_stop() would still try to unlock it.

Fix this by unconditionally acquiring the mutex immediately after
allocation, regardless of whether the allocation succeeded.

Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/20250929113238.3722055-1-sashal@kernel.org
Fixes: b355247df104 ("tracing: Cache ":mod:" events for modules not loaded yet")
Signed-off-by: Sasha Levin
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9f3e9537417d..e00da4182deb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1629,11 +1629,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	loff_t l;
 
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	mutex_lock(&event_mutex);
 	if (!iter)
 		return NULL;
 
-	mutex_lock(&event_mutex);
-
 	iter->type = SET_EVENT_FILE;
 	iter->file = list_entry(&tr->events,
 				struct trace_event_file, list);
--
cgit v1.2.3
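The invariant behind this last fix is that seq_file calls ->stop() whenever ->start() has been called, even if ->start() returned NULL, so any lock released in ->stop() must be taken on every path through ->start(), including the allocation failure path. A userspace sketch of that shape, with a pthread mutex standing in for event_mutex and a simplified iterator:

/* Userspace sketch of the s_start()/s_stop() locking contract; not the kernel code. */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;

struct set_event_iter { int type; };

/* ->start(): the mutex is taken on every path, even if the allocation fails. */
static void *my_start(void)
{
	struct set_event_iter *iter = calloc(1, sizeof(*iter));

	pthread_mutex_lock(&event_mutex);
	if (!iter)
		return NULL;		/* caller still gets a balanced stop */
	return iter;
}

/* ->stop(): always called by the caller, always unlocks. */
static void my_stop(void *v)
{
	free(v);			/* free(NULL) is a no-op */
	pthread_mutex_unlock(&event_mutex);
}

int main(void)
{
	void *it = my_start();

	my_stop(it);			/* balanced whether or not calloc succeeded */
	return 0;
}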