From b31ac41b59b6b6f1f6d426e2088e5c391bf89bf3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:36 +0000 Subject: dma/pool: Improve pool lookup If CONFIG_ZONE_DMA32 is enabled, but we have not allocated the corresponding atomic_pool_dma32, dma_guess_pool() may return the NULL value of that and fail a GFP_DMA32 allocation without trying to fall back to other pools which may exist. Furthermore, if no GFP_DMA pool exists, it is preferable to try GFP_DMA32 rather than immediately fall back to GFP_KERNEL with even less chance of success. Improve matters by encoding an explicit order of pool preference for each flag. Signed-off-by: Robin Murphy Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/c846b1a2f43295cac926c7af2ce907f62baec518.1768230104.git.robin.murphy@arm.com --- kernel/dma/pool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 26392badc36b..2645cfb5718b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -224,10 +224,10 @@ postcore_initcall(dma_atomic_pool_init); static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { if (prev == NULL) { - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) - return atomic_pool_dma32; - if (atomic_pool_dma && (gfp & GFP_DMA)) - return atomic_pool_dma; + if (gfp & GFP_DMA) + return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel; + if (gfp & GFP_DMA32) + return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel; return atomic_pool_kernel; } if (prev == atomic_pool_kernel) -- cgit v1.2.3 From c6ccd098807483762ccd726e1498bac5a71d0005 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:38 +0000 Subject: dma/pool: Avoid allocating redundant pools On smaller systems, e.g. embedded arm64, it is common for all memory to end up in ZONE_DMA32 or even ZONE_DMA. In such cases it is redundant to allocate a nominal pool for an empty higher zone that just ends up coming from a lower zone that should already have its own pool anyway. We already have logic to skip allocating a ZONE_DMA pool when that is empty, so generalise that to save memory in the case of other zones too. 
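For illustration, here is a small userspace model of how the reworked dma_guess_pool() first pass behaves once pools for empty zones are no longer allocated: a request simply falls through to the next-best existing pool. The pool names, flag values and main() harness below are invented for the sketch, and the ?: chains use the same GNU C conditional-operator extension as the patch; this is not kernel code.

#include <stdio.h>
#include <stddef.h>

/* Stand-ins for the three atomic pools; NULL models "never allocated". */
static const char *pool_dma;                    /* no ZONE_DMA pool here */
static const char *pool_dma32  = "dma32 pool";
static const char *pool_kernel = "kernel pool";

#define MY_GFP_DMA	0x1
#define MY_GFP_DMA32	0x2

/* Mirrors the first-try logic: prefer the matching pool, then fall back
 * to the next-best one, instead of returning a NULL pool outright. */
static const char *guess_pool(unsigned int gfp)
{
	if (gfp & MY_GFP_DMA)
		return pool_dma ?: pool_dma32 ?: pool_kernel;
	if (gfp & MY_GFP_DMA32)
		return pool_dma32 ?: pool_dma ?: pool_kernel;
	return pool_kernel ?: pool_dma32 ?: pool_dma;
}

int main(void)
{
	/* A GFP_DMA request falls back to the DMA32 pool in this setup. */
	printf("GFP_DMA    -> %s\n", guess_pool(MY_GFP_DMA));
	printf("GFP_DMA32  -> %s\n", guess_pool(MY_GFP_DMA32));
	printf("GFP_KERNEL -> %s\n", guess_pool(0));
	return 0;
}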
Signed-off-by: Robin Murphy Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/8ab8d8a620dee0109f33f5cb63d6bfeed35aac37.1768230104.git.robin.murphy@arm.com --- kernel/dma/pool.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 2645cfb5718b..c5da29ad010c 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -184,6 +184,12 @@ static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, return pool; } +#ifdef CONFIG_ZONE_DMA32 +#define has_managed_dma32 has_managed_zone(ZONE_DMA32) +#else +#define has_managed_dma32 false +#endif + static int __init dma_atomic_pool_init(void) { int ret = 0; @@ -199,17 +205,20 @@ static int __init dma_atomic_pool_init(void) } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); - atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + /* All memory might be in the DMA zone(s) to begin with */ + if (has_managed_zone(ZONE_NORMAL)) { + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL); - if (!atomic_pool_kernel) - ret = -ENOMEM; + if (!atomic_pool_kernel) + ret = -ENOMEM; + } if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) ret = -ENOMEM; } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + if (has_managed_dma32) { atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA32); if (!atomic_pool_dma32) @@ -228,7 +237,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel; if (gfp & GFP_DMA32) return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel; - return atomic_pool_kernel; + return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma; } if (prev == atomic_pool_kernel) return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; -- cgit v1.2.3 From 90f3c123247e9564f2ecf861946ec41ceaf5e198 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 6 Jan 2026 18:33:21 +0200 Subject: panic: only warn about deprecated panic_print on write access The panic_print_deprecated() warning is being triggered on both read and write operations to the panic_print parameter. This causes spurious warnings when users run 'sysctl -a' to list all sysctl values, since that command reads /proc/sys/kernel/panic_print and triggers the deprecation notice. Modify the handlers to only emit the deprecation warning when the parameter is actually being set: - sysctl_panic_print_handler(): check 'write' flag before warning. - panic_print_get(): remove the deprecation call entirely. This way, users are only warned when they actively try to use the deprecated parameter, not when passively querying system state. 
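A minimal userspace model of the behavioural change, assuming nothing beyond what the patch describes (the function names below are stand-ins, not the kernel handlers):

#include <stdio.h>
#include <stdbool.h>

static void print_deprecated(void)
{
	fprintf(stderr, "deprecation warning\n");
}

/* Old behaviour: warn on every access, so even a read-only
 * 'sysctl -a' style sweep produces the warning. */
static void handler_old(bool write)
{
	(void)write;
	print_deprecated();
}

/* New behaviour: only a write, i.e. an actual attempt to use the
 * deprecated knob, produces the warning. */
static void handler_new(bool write)
{
	if (write)
		print_deprecated();
}

int main(void)
{
	handler_old(false);	/* warns, even though this is just a read */
	handler_new(false);	/* silent */
	handler_new(true);	/* warns */
	return 0;
}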
Link: https://lkml.kernel.org/r/20260106163321.83586-1-gal@nvidia.com Fixes: ee13240cd78b ("panic: add note that panic_print sysctl interface is deprecated") Fixes: 2683df6539cb ("panic: add note that 'panic_print' parameter is deprecated") Signed-off-by: Gal Pressman Reviewed-by: Mark Bloch Reviewed-by: Nimrod Oren Cc: Feng Tang Cc: Joel Granados Cc: Petr Mladek Cc: Signed-off-by: Andrew Morton --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 0d52210a9e2b..0c20fcaae98a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -131,7 +131,8 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - panic_print_deprecated(); + if (write) + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -1014,7 +1015,6 @@ static int panic_print_set(const char *val, const struct kernel_param *kp) static int panic_print_get(char *val, const struct kernel_param *kp) { - panic_print_deprecated(); return param_get_ulong(val, kp); } -- cgit v1.2.3 From e806f7dde8ba28bc72a7a0898589cac79f6362ac Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 20 Jan 2026 07:55:55 +0100 Subject: timekeeping: Adjust the leap state for the correct auxiliary timekeeper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When __do_adjtimex() was introduced to handle adjtimex for any timekeeper, one reference to tk_core was not updated. When called on an auxiliary timekeeper, the core timekeeper would be updated incorrectly. This gets caught by the lock debugging diagnostics because the timekeeper's sequence lock gets written to without holding its associated spinlock: WARNING: include/linux/seqlock.h:226 at __do_adjtimex+0x394/0x3b0, CPU#2: test/125 aux_clock_adj (kernel/time/timekeeping.c:2979) __do_sys_clock_adjtime (kernel/time/posix-timers.c:1161 kernel/time/posix-timers.c:1173) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:131) Update the correct auxiliary timekeeper.
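The bug class is easy to model outside the kernel; the sketch below uses invented types and names and only mirrors the shape of the one-line fix (operate on the timekeeper passed in, not on the global core instance):

#include <stdio.h>

struct tk_data { int leap_state; };

static struct tk_data core_tk;		/* the "core" timekeeper   */
static struct tk_data aux_tk;		/* an auxiliary timekeeper */

/* Buggy shape: the helper takes a timekeeper argument but still
 * touches the global core instance. */
static void update_leap_state_buggy(struct tk_data *tkd)
{
	(void)tkd;			/* parameter ignored - that is the bug */
	core_tk.leap_state++;		/* wrong object gets updated */
}

/* Fixed shape: operate on the timekeeper that was passed in. */
static void update_leap_state_fixed(struct tk_data *tkd)
{
	tkd->leap_state++;
}

int main(void)
{
	update_leap_state_buggy(&aux_tk);
	update_leap_state_fixed(&aux_tk);
	/* With the buggy helper the core timekeeper changed instead of aux. */
	printf("core=%d aux=%d\n", core_tk.leap_state, aux_tk.leap_state);
	return 0;
}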
Fixes: 775f71ebedd3 ("timekeeping: Make do_adjtimex() reusable") Fixes: ecf3e7030491 ("timekeeping: Provide adjtimex() for auxiliary clocks") Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260120-timekeeper-auxclock-leapstate-v1-1-5b358c6b3cfd@linutronix.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3ec3daa4acab..91fa2003351c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2735,7 +2735,7 @@ static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); result->clock_set = true; } else { - tk_update_leap_state_all(&tk_core); + tk_update_leap_state_all(tkd); } /* Update the multiplier immediately if frequency was set directly */ -- cgit v1.2.3 From c06343be0b4e03fe319910dd7a5d5b9929e1c0cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Dec 2025 18:21:05 +0100 Subject: clocksource: Reduce watchdog readout delay limit to prevent false positives The "valid" readout delay between the two reads of the watchdog is larger than the valid delta between the resulting watchdog and clocksource intervals, which leads to false positive watchdog results. Assume TSC is the clocksource and HPET is the watchdog and both have an uncertainty margin of 250us (default). The watchdog readout does: 1) wdnow = read(HPET); 2) csnow = read(TSC); 3) wdend = read(HPET); The valid window for the delta between #1 and #3 is calculated by the uncertainty margins of the watchdog and the clocksource: m = 2 * watchdog.uncertainty_margin + cs.uncertainty_margin; which results in 750us for the TSC/HPET case. The actual interval comparison uses a smaller margin: m = watchdog.uncertainty_margin + cs.uncertainty_margin; which results in 500us for the TSC/HPET case. That means the following scenario will trigger the watchdog: Watchdog cycle N: 1) wdnow[N] = read(HPET); 2) csnow[N] = read(TSC); 3) wdend[N] = read(HPET); Assume the delay between #1 and #2 is 100us and the overall delay between #1 and #3 stays within the 750us limit, so the readout is accepted. Watchdog cycle N + 1: 4) wdnow[N + 1] = read(HPET); 5) csnow[N + 1] = read(TSC); 6) wdend[N + 1] = read(HPET); If the delay between #4 and #6 is within the 750us margin then any delay between #4 and #5 which is larger than 600us will fail the interval check and mark the TSC unstable because the intervals are calculated against the previous value: wd_int = wdnow[N + 1] - wdnow[N]; cs_int = csnow[N + 1] - csnow[N]; Putting the above delays in place this results in: cs_int = (wdnow[N + 1] + 610us) - (wdnow[N] + 100us); -> cs_int = wd_int + 510us; which is obviously larger than the allowed 500us margin and results in marking TSC unstable. Fix this by using the same margin as the interval comparison. If the delay between two watchdog reads is larger than that, then the readout was either disturbed by interconnect congestion, NMIs or SMIs. Fixes: 4ac1dd3245b9 ("clocksource: Set cs_watchdog_read() checks based on .uncertainty_margin") Reported-by: Daniel J Blueman Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Tested-by: Paul E.
McKenney Link: https://lore.kernel.org/lkml/20250602223251.496591-1-daniel@quora.org/ Link: https://patch.msgid.link/87bjjxc9dq.ffs@tglx --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a1890a073196..df7194961658 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -252,7 +252,7 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { - int64_t md = 2 * watchdog->uncertainty_margin; + int64_t md = watchdog->uncertainty_margin; unsigned int nretries, max_retries; int64_t wd_delay, wd_seq_delay; u64 wd_end, wd_end2; -- cgit v1.2.3 From d06bf78e55d5159c1b00072e606ab924ffbbad35 Mon Sep 17 00:00:00 2001 From: Will Rosenberg Date: Mon, 19 Jan 2026 11:49:56 -0700 Subject: perf: Fix refcount warning on event->mmap_count increment When calling refcount_inc(&event->mmap_count) inside perf_mmap_rb(), the following warning is triggered: refcount_t: addition on 0; use-after-free. WARNING: lib/refcount.c:25 PoC: struct perf_event_attr attr = {0}; int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); mmap(NULL, 0x3000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); int victim = syscall(__NR_perf_event_open, &attr, 0, -1, fd, PERF_FLAG_FD_OUTPUT); mmap(NULL, 0x3000, PROT_READ | PROT_WRITE, MAP_SHARED, victim, 0); This occurs when creating a group member event with the flag PERF_FLAG_FD_OUTPUT. The group leader should be mmap-ed and then mmap-ing the event triggers the warning. Since the event has copied the output_event in perf_event_set_output(), event->rb is set. As a result, perf_mmap_rb() calls refcount_inc(&event->mmap_count) when event->mmap_count = 0. Disallow the case when event->mmap_count = 0. This also prevents two events from updating the same user_page. Fixes: 448f97fba901 ("perf: Convert mmap() refcounts to refcount_t") Suggested-by: Peter Zijlstra Signed-off-by: Will Rosenberg Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260119184956.801238-1-whrosenb@asu.edu --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b5cb620499e..a0fa488bce84 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6997,6 +6997,15 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; + /* + * If this event doesn't have mmap_count, we're attempting to + * create an alias of another event's mmap(); this would mean + * both events will end up scribbling the same user_page; + * which makes no sense. + */ + if (!refcount_read(&event->mmap_count)) + return -EBUSY; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer -- cgit v1.2.3 From 98c88dc8a1ace642d9021b103b28cba7b51e3abc Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 21 Jan 2026 17:33:17 +0100 Subject: sched/fair: Fix pelt clock sync when entering idle Samuel and Alex reported regressions of the util_avg of RT rq with commit 17e3e88ed0b6 ("sched/fair: Fix pelt lost idle time detection"). It happens that fair is updating and syncing the pelt clock with task one when pick_next_task_fair() fails to pick a task but before the prev scheduling class got a chance to update its pelt signals. 
Move update_idle_rq_clock_pelt() in set_next_task_idle() which is called after prev class has been called. Fixes: 17e3e88ed0b6 ("sched/fair: Fix pelt lost idle time detection") Closes: https://lore.kernel.org/all/CAG2KctpO6VKS6GN4QWDji0t92_gNBJ7HjjXrE+6H+RwRXt=iLg@mail.gmail.com/ Closes: https://lore.kernel.org/all/8cf19bf0e0054dcfed70e9935029201694f1bb5a.camel@mediatek.com/ Reported-by: Samuel Wu Reported-by: Alex Hoh Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Samuel Wu Tested-by: Alex Hoh Link: https://patch.msgid.link/20260121163317.505635-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 6 ------ kernel/sched/idle.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e71302282671..a148c61a8085 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8995,12 +8995,6 @@ idle: goto again; } - /* - * rq is about to be idle, check if we need to update the - * lost_idle_time of clock_pelt - */ - update_idle_rq_clock_pelt(rq); - return NULL; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c174afe1dd17..abf8f15d60c9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -468,6 +468,12 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); next->se.exec_start = rq_clock_task(rq); + + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); } struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) -- cgit v1.2.3 From 4f70f106bca1a56bd66d00830ac91680bd754974 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 20 Jan 2026 11:33:35 +0000 Subject: sched/fair: Disable scheduler feature NEXT_BUDDY NEXT_BUDDY was disabled with the introduction of EEVDF and enabled again after NEXT_BUDDY was rewritten for EEVDF by commit e837456fdca8 ("sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals"). It was not expected that this would be a universal win without a crystal ball instruction but the reported regressions are a concern [1][2] even if gains were also reported. Specifically; o mysql with client/server running on different servers regresses o specjbb reports lower peak metrics o daytrader regresses The mysql is realistic and a concern. It needs to be confirmed if specjbb is simply shifting the point where peak performance is measured but still a concern. daytrader is considered to be representative of a real workload. Access to test machines is currently problematic for verifying any fix to this problem. Disable NEXT_BUDDY for now by default until the root causes are addressed. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Tested-by: Madadi Vineeth Reddy Link: https://lore.kernel.org/lkml/4b96909a-f1ac-49eb-b814-97b8adda6229@arm.com [1] Link: https://lore.kernel.org/lkml/ec3ea66f-3a0d-4b5a-ab36-ce778f159b5b@linux.ibm.com [2] Link: https://patch.msgid.link/fyqsk63pkoxpeaclyqsm5nwtz3dyejplr7rg6p74xwemfzdzuu@7m7xhs5aqpqw --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 980d92bab8ab..136a6584be79 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
*/ -SCHED_FEAT(NEXT_BUDDY, true) +SCHED_FEAT(NEXT_BUDDY, false) /* * Allow completely ignoring cfs_rq->next; which can be set from various -- cgit v1.2.3 From 15257cc2f905dbf5813c0bfdd3c15885f28093c4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 23 Jan 2026 11:28:58 +0100 Subject: sched/fair: Revert force wakeup preemption This aggressively bypasses run_to_parity and slice protection with the assumption that this is what the waker wants, but there is no guarantee that the wakee will be the next to run. It is a better choice to use yield_to_task or WF_SYNC in such a case. This increases the number of reschedules and preemptions because a task quickly becomes "ineligible" once it runs; we update the task's vruntime periodically, before the task has exhausted its slice or even a minimum quantum. Example: 2 tasks A and B wake up simultaneously with lag = 0. Both are eligible. Task A runs 1st and wakes up task C. The scheduler updates task A's vruntime, which becomes greater than the average runtime as all others have a lag == 0 and didn't run yet. Now task A is ineligible because it received more runtime than the other tasks, but it has not yet exhausted its slice nor a min quantum. We force preemption and disable protection, but task B will run 1st, not task C. Sidenote: DELAY_ZERO increases this effect by clearing positive lag at wake up. Fixes: e837456fdca8 ("sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260123102858.52428-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a148c61a8085..3eaeceda71b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8828,16 +8828,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if ((wake_flags & WF_FORK) || pse->sched_delayed) return; - /* - * If @p potentially is completing work required by current then - * consider preemption. - * - * Reschedule if waker is no longer eligible. */ - if (in_task() && !entity_eligible(cfs_rq, se)) { - preempt_action = PREEMPT_WAKEUP_RESCHED; - goto preempt; - } - /* Prefer picking wakee soon if appropriate. */ if (sched_feat(NEXT_BUDDY) && set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { -- cgit v1.2.3 From 90f9f5d64cae4e72defd96a2a22760173cb3c9ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2026 19:48:24 -0500 Subject: tracing: Fix crash on synthetic stacktrace field usage When creating a synthetic event based on an existing synthetic event that had a stacktrace field and the new synthetic event used that field, a kernel crash occurred: ~# cd /sys/kernel/tracing ~# echo 's:stack unsigned long stack[];' > dynamic_events ~# echo 'hist:keys=prev_pid:s0=common_stacktrace if prev_state & 3' >> events/sched/sched_switch/trigger ~# echo 'hist:keys=next_pid:s1=$s0:onmatch(sched.sched_switch).trace(stack,$s1)' >> events/sched/sched_switch/trigger The above creates a synthetic event that takes a stacktrace when a task schedules out in a non-running state and passes that stacktrace to the sched_switch event when that task schedules back in. It triggers the "stack" synthetic event that has a stacktrace as its field (called "stack").
~# echo 's:syscall_stack s64 id; unsigned long stack[];' >> dynamic_events ~# echo 'hist:keys=common_pid:s2=stack' >> events/synthetic/stack/trigger ~# echo 'hist:keys=common_pid:s3=$s2,i0=id:onmatch(synthetic.stack).trace(syscall_stack,$i0,$s3)' >> events/raw_syscalls/sys_exit/trigger The above makes another synthetic event called "syscall_stack" that attaches the first synthetic event (stack) to the sys_exit trace event and records the stacktrace from the stack event with the id of the system call that is exiting. When enabling this event (or using it in a histogram): ~# echo 1 > events/synthetic/syscall_stack/enable Produces a kernel crash! BUG: unable to handle page fault for address: 0000000000400010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 6 UID: 0 PID: 1257 Comm: bash Not tainted 6.16.3+deb14-amd64 #1 PREEMPT(lazy) Debian 6.16.3-1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-debian-1.17.0-1 04/01/2014 RIP: 0010:trace_event_raw_event_synth+0x90/0x380 Code: c5 00 00 00 00 85 d2 0f 84 e1 00 00 00 31 db eb 34 0f 1f 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 <49> 8b 04 24 48 83 c3 01 8d 0c c5 08 00 00 00 01 cd 41 3b 5d 40 0f RSP: 0018:ffffd2670388f958 EFLAGS: 00010202 RAX: ffff8ba1065cc100 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: fffff266ffda7b90 RDI: ffffd2670388f9b0 RBP: 0000000000000010 R08: ffff8ba104e76000 R09: ffffd2670388fa50 R10: ffff8ba102dd42e0 R11: ffffffff9a908970 R12: 0000000000400010 R13: ffff8ba10a246400 R14: ffff8ba10a710220 R15: fffff266ffda7b90 FS: 00007fa3bc63f740(0000) GS:ffff8ba2e0f48000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000400010 CR3: 0000000107f9e003 CR4: 0000000000172ef0 Call Trace: ? __tracing_map_insert+0x208/0x3a0 action_trace+0x67/0x70 event_hist_trigger+0x633/0x6d0 event_triggers_call+0x82/0x130 trace_event_buffer_commit+0x19d/0x250 trace_event_raw_event_sys_exit+0x62/0xb0 syscall_exit_work+0x9d/0x140 do_syscall_64+0x20a/0x2f0 ? trace_event_raw_event_sched_switch+0x12b/0x170 ? save_fpregs_to_fpstate+0x3e/0x90 ? _raw_spin_unlock+0xe/0x30 ? finish_task_switch.isra.0+0x97/0x2c0 ? __rseq_handle_notify_resume+0xad/0x4c0 ? __schedule+0x4b8/0xd00 ? restore_fpregs_from_fpstate+0x3c/0x90 ? switch_fpu_return+0x5b/0xe0 ? do_syscall_64+0x1ef/0x2f0 ? do_fault+0x2e9/0x540 ? __handle_mm_fault+0x7d1/0xf70 ? count_memcg_events+0x167/0x1d0 ? handle_mm_fault+0x1d7/0x2e0 ? do_user_addr_fault+0x2c3/0x7f0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The reason is that the stacktrace field is not labeled as such; it is treated as a normal field rather than as the stacktrace that it is. In trace_event_raw_event_synth() the event's field is still treated as a dynamic array, but the retrieval of the data is handled as a normal field, and the reference is just the meta data: // Meta data is retrieved instead of a dynamic array str_val = (char *)(long)var_ref_vals[val_idx]; // Then when it tries to process it: len = *((unsigned long *)str_val) + 1; It triggers a kernel page fault. To fix this, first, when defining the fields of the first synthetic event, set the filter type to FILTER_STACKTRACE. This is used later by the second synthetic event to know that this field is a stacktrace. When creating the field of the new synthetic event, have it use this FILTER_STACKTRACE to know to create a stacktrace field to copy the stacktrace into.
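As a rough userspace model of the mismatch described above (all structures and names here are invented; it only shows why reading a scalar field value as a length-prefixed pointer must fault):

#include <stdio.h>

/* A variable's resolved value: either a plain scalar, or (for a
 * stacktrace) a pointer to a length-prefixed array of entries. */
union var_val {
	unsigned long scalar;
	unsigned long *stack;	/* stack[0] = number of entries */
};

static void record_stack(union var_val v)
{
	/* Correct handling for a field known to be a stacktrace. */
	unsigned long len = v.stack[0];
	printf("copying %lu stack entries\n", len);
}

static void record_scalar(union var_val v)
{
	printf("scalar value %#lx\n", v.scalar);
}

int main(void)
{
	unsigned long trace[4] = { 3, 0xa1, 0xa2, 0xa3 };
	union var_val stack_val = { .stack = trace };
	union var_val id_val    = { .scalar = 0x400010 };

	record_stack(stack_val);	/* field labeled as a stacktrace */
	record_scalar(id_val);		/* field labeled as a scalar     */

	/* The bug corresponds to calling record_stack() on a value that was
	 * never stored as a pointer: the "len = *(unsigned long *)str_val"
	 * step dereferences a bogus address such as 0x400010 and faults. */
	return 0;
}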
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Tom Zanussi Link: https://patch.msgid.link/20260122194824.6905a38e@gandalf.local.home Fixes: 00cf3d672a9d ("tracing: Allow synthetic events to pass around stacktraces") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 9 +++++++++ kernel/trace/trace_events_synth.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5e6e70540eef..c97bb2fda5c0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2057,6 +2057,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else hist_field->fn_num = HIST_FIELD_FN_PSTRING; + } else if (field->filter_type == FILTER_STACKTRACE) { + flags |= HIST_FIELD_FL_STACKTRACE; + + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + + hist_field->fn_num = HIST_FIELD_FN_STACK; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 4554c458b78c..45c187e77e21 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -130,7 +130,9 @@ static int synth_event_define_fields(struct trace_event_call *call) struct synth_event *event = call->data; unsigned int i, size, n_u64; char *name, *type; + int filter_type; bool is_signed; + bool is_stack; int ret = 0; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { @@ -138,8 +140,12 @@ static int synth_event_define_fields(struct trace_event_call *call) is_signed = event->fields[i]->is_signed; type = event->fields[i]->type; name = event->fields[i]->name; + is_stack = event->fields[i]->is_stack; + + filter_type = is_stack ? FILTER_STACKTRACE : FILTER_OTHER; + ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); + is_signed, filter_type); if (ret) break; -- cgit v1.2.3 From 00f13e28a9c3acd40f0551cde7e9d2d1a41585bf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Jan 2026 16:26:25 -0800 Subject: tracing: Avoid possible signed 64-bit truncation 64-bit truncation to 32-bit can result in the sign of the truncated value changing. The cmp_mod_entry is used in bsearch and so the truncation could result in an invalid search order. This would only happen were the addresses more than 2GB apart and so unlikely, but let's fix the potentially broken compare anyway. 
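The truncation hazard can be reproduced in a few lines of userspace C; the addresses below are made up, and the comparators only mirror the shape of the old and new cmp_mod_entry() logic:

#include <stdio.h>

struct trace_mod_entry { unsigned long mod_addr; };

/* Old shape: the unsigned long difference is implicitly truncated to
 * int, which can flip the sign when the addresses are > 2GB apart. */
static int cmp_old(unsigned long addr, const struct trace_mod_entry *ent)
{
	if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr)
		return 0;
	return addr - ent->mod_addr;	/* implicit 64-bit -> int truncation */
}

/* New shape: only ever returns a correctly signed result. */
static int cmp_new(unsigned long addr, const struct trace_mod_entry *ent)
{
	if (addr < ent[0].mod_addr)
		return -1;
	return addr >= ent[1].mod_addr;
}

int main(void)
{
	/* Hypothetical module range starting more than 2GB above the key. */
	struct trace_mod_entry ents[2] = { { 0x1a0000000UL }, { 0x1b0000000UL } };
	unsigned long key = 0x100000000UL;	/* below the range */

	/* The key is below the range, so a bsearch comparator must be < 0. */
	printf("old: %d (sign flipped by truncation)\n", cmp_old(key, ents));
	printf("new: %d\n", cmp_new(key, ents));
	return 0;
}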
Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260108002625.333331-1-irogers@google.com Signed-off-by: Ian Rogers Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index baec63134ab6..8bd4ec08fb36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6115,10 +6115,10 @@ static int cmp_mod_entry(const void *key, const void *pivot) unsigned long addr = (unsigned long)key; const struct trace_mod_entry *ent = pivot; - if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) - return 0; - else - return addr - ent->mod_addr; + if (addr < ent[0].mod_addr) + return -1; + + return addr >= ent[1].mod_addr; } /** -- cgit v1.2.3 From c9703d17d2c86eda38fe4917ca70c27ec9dbe162 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Mon, 12 Jan 2026 10:16:01 +0800 Subject: function_graph: Fix args pointer mismatch in print_graph_retval() When funcgraph-args and funcgraph-retaddr are both enabled, many kernel functions display invalid parameters in trace logs. The issue occurs because print_graph_retval() passes a mismatched args pointer to print_function_args(). Fix this by retrieving the correct args pointer using the FGRAPH_ENTRY_ARGS() macro. Link: https://patch.msgid.link/20260112021601.1300479-1-dolinux.peng@gmail.com Fixes: f83ac7544fbf ("function_graph: Enable funcgraph-args and funcgraph-retaddr to work simultaneously") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Donglin Peng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1e9c9913309..1de6f1573621 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -901,7 +901,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr trace_seq_printf(s, "%ps", func); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { - print_function_args(s, entry->args, (unsigned long)func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), (unsigned long)func); trace_seq_putc(s, ';'); } else trace_seq_puts(s, "();"); -- cgit v1.2.3