From b31ac41b59b6b6f1f6d426e2088e5c391bf89bf3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:36 +0000 Subject: dma/pool: Improve pool lookup If CONFIG_ZONE_DMA32 is enabled, but we have not allocated the corresponding atomic_pool_dma32, dma_guess_pool() may return the NULL value of that and fail a GFP_DMA32 allocation without trying to fall back to other pools which may exist. Furthermore, if no GFP_DMA pool exists, it is preferable to try GFP_DMA32 rather than immediately fall back to GFP_KERNEL with even less chance of success. Improve matters by encoding an explicit order of pool preference for each flag. Signed-off-by: Robin Murphy Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/c846b1a2f43295cac926c7af2ce907f62baec518.1768230104.git.robin.murphy@arm.com --- kernel/dma/pool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 26392badc36b..2645cfb5718b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -224,10 +224,10 @@ postcore_initcall(dma_atomic_pool_init); static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { if (prev == NULL) { - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) - return atomic_pool_dma32; - if (atomic_pool_dma && (gfp & GFP_DMA)) - return atomic_pool_dma; + if (gfp & GFP_DMA) + return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel; + if (gfp & GFP_DMA32) + return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel; return atomic_pool_kernel; } if (prev == atomic_pool_kernel) -- cgit v1.2.3 From c6ccd098807483762ccd726e1498bac5a71d0005 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:38 +0000 Subject: dma/pool: Avoid allocating redundant pools On smaller systems, e.g. embedded arm64, it is common for all memory to end up in ZONE_DMA32 or even ZONE_DMA. In such cases it is redundant to allocate a nominal pool for an empty higher zone that just ends up coming from a lower zone that should already have its own pool anyway. We already have logic to skip allocating a ZONE_DMA pool when that is empty, so generalise that to save memory in the case of other zones too. 
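For illustration, here is a small userspace model of how the reworked dma_guess_pool() first pass behaves once pools for empty zones are no longer allocated: a request simply falls through to the next-best existing pool. The pool names, flag values and main() harness below are invented for the sketch, and the ?: chains use the same GNU C conditional-operator extension as the patch; this is not kernel code.

#include <stdio.h>
#include <stddef.h>

/* Stand-ins for the three atomic pools; NULL models "never allocated". */
static const char *pool_dma;                    /* no ZONE_DMA pool here */
static const char *pool_dma32  = "dma32 pool";
static const char *pool_kernel = "kernel pool";

#define MY_GFP_DMA	0x1
#define MY_GFP_DMA32	0x2

/* Mirrors the first-try logic: prefer the matching pool, then fall back
 * to the next-best one, instead of returning a NULL pool outright. */
static const char *guess_pool(unsigned int gfp)
{
	if (gfp & MY_GFP_DMA)
		return pool_dma ?: pool_dma32 ?: pool_kernel;
	if (gfp & MY_GFP_DMA32)
		return pool_dma32 ?: pool_dma ?: pool_kernel;
	return pool_kernel ?: pool_dma32 ?: pool_dma;
}

int main(void)
{
	/* A GFP_DMA request falls back to the DMA32 pool in this setup. */
	printf("GFP_DMA    -> %s\n", guess_pool(MY_GFP_DMA));
	printf("GFP_DMA32  -> %s\n", guess_pool(MY_GFP_DMA32));
	printf("GFP_KERNEL -> %s\n", guess_pool(0));
	return 0;
}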
Signed-off-by: Robin Murphy Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/8ab8d8a620dee0109f33f5cb63d6bfeed35aac37.1768230104.git.robin.murphy@arm.com --- kernel/dma/pool.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 2645cfb5718b..c5da29ad010c 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -184,6 +184,12 @@ static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, return pool; } +#ifdef CONFIG_ZONE_DMA32 +#define has_managed_dma32 has_managed_zone(ZONE_DMA32) +#else +#define has_managed_dma32 false +#endif + static int __init dma_atomic_pool_init(void) { int ret = 0; @@ -199,17 +205,20 @@ static int __init dma_atomic_pool_init(void) } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); - atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + /* All memory might be in the DMA zone(s) to begin with */ + if (has_managed_zone(ZONE_NORMAL)) { + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL); - if (!atomic_pool_kernel) - ret = -ENOMEM; + if (!atomic_pool_kernel) + ret = -ENOMEM; + } if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) ret = -ENOMEM; } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + if (has_managed_dma32) { atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA32); if (!atomic_pool_dma32) @@ -228,7 +237,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel; if (gfp & GFP_DMA32) return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel; - return atomic_pool_kernel; + return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma; } if (prev == atomic_pool_kernel) return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; -- cgit v1.2.3 From 90f3c123247e9564f2ecf861946ec41ceaf5e198 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 6 Jan 2026 18:33:21 +0200 Subject: panic: only warn about deprecated panic_print on write access The panic_print_deprecated() warning is being triggered on both read and write operations to the panic_print parameter. This causes spurious warnings when users run 'sysctl -a' to list all sysctl values, since that command reads /proc/sys/kernel/panic_print and triggers the deprecation notice. Modify the handlers to only emit the deprecation warning when the parameter is actually being set: - sysctl_panic_print_handler(): check 'write' flag before warning. - panic_print_get(): remove the deprecation call entirely. This way, users are only warned when they actively try to use the deprecated parameter, not when passively querying system state. 
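A minimal userspace model of the behavioural change, assuming nothing beyond what the patch describes (the function names below are stand-ins, not the kernel handlers):

#include <stdio.h>
#include <stdbool.h>

static void print_deprecated(void)
{
	fprintf(stderr, "deprecation warning\n");
}

/* Old behaviour: warn on every access, so even a read-only
 * 'sysctl -a' style sweep produces the warning. */
static void handler_old(bool write)
{
	(void)write;
	print_deprecated();
}

/* New behaviour: only a write, i.e. an actual attempt to use the
 * deprecated knob, produces the warning. */
static void handler_new(bool write)
{
	if (write)
		print_deprecated();
}

int main(void)
{
	handler_old(false);	/* warns, even though this is just a read */
	handler_new(false);	/* silent */
	handler_new(true);	/* warns */
	return 0;
}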
Link: https://lkml.kernel.org/r/20260106163321.83586-1-gal@nvidia.com Fixes: ee13240cd78b ("panic: add note that panic_print sysctl interface is deprecated") Fixes: 2683df6539cb ("panic: add note that 'panic_print' parameter is deprecated") Signed-off-by: Gal Pressman Reviewed-by: Mark Bloch Reviewed-by: Nimrod Oren Cc: Feng Tang Cc: Joel Granados Cc: Petr Mladek Cc: Signed-off-by: Andrew Morton --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 0d52210a9e2b..0c20fcaae98a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -131,7 +131,8 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - panic_print_deprecated(); + if (write) + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -1014,7 +1015,6 @@ static int panic_print_set(const char *val, const struct kernel_param *kp) static int panic_print_get(char *val, const struct kernel_param *kp) { - panic_print_deprecated(); return param_get_ulong(val, kp); } -- cgit v1.2.3 From e806f7dde8ba28bc72a7a0898589cac79f6362ac Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 20 Jan 2026 07:55:55 +0100 Subject: timekeeping: Adjust the leap state for the correct auxiliary timekeeper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When __do_adjtimex() was introduced to handle adjtimex for any timekeeper, one reference to tk_core was not updated. When called on an auxiliary timekeeper, the core timekeeper would be updated incorrectly. This gets caught by the lock debugging diagnostics because the timekeeper's sequence lock gets written to without holding its associated spinlock: WARNING: include/linux/seqlock.h:226 at __do_adjtimex+0x394/0x3b0, CPU#2: test/125 aux_clock_adj (kernel/time/timekeeping.c:2979) __do_sys_clock_adjtime (kernel/time/posix-timers.c:1161 kernel/time/posix-timers.c:1173) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:131) Update the correct auxiliary timekeeper.
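The bug class is easy to model outside the kernel; the sketch below uses invented types and names and only mirrors the shape of the one-line fix (operate on the timekeeper passed in, not on the global core instance):

#include <stdio.h>

struct tk_data { int leap_state; };

static struct tk_data core_tk;		/* the "core" timekeeper   */
static struct tk_data aux_tk;		/* an auxiliary timekeeper */

/* Buggy shape: the helper takes a timekeeper argument but still
 * touches the global core instance. */
static void update_leap_state_buggy(struct tk_data *tkd)
{
	(void)tkd;			/* parameter ignored - that is the bug */
	core_tk.leap_state++;		/* wrong object gets updated */
}

/* Fixed shape: operate on the timekeeper that was passed in. */
static void update_leap_state_fixed(struct tk_data *tkd)
{
	tkd->leap_state++;
}

int main(void)
{
	update_leap_state_buggy(&aux_tk);
	update_leap_state_fixed(&aux_tk);
	/* With the buggy helper the core timekeeper changed instead of aux. */
	printf("core=%d aux=%d\n", core_tk.leap_state, aux_tk.leap_state);
	return 0;
}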
Fixes: 775f71ebedd3 ("timekeeping: Make do_adjtimex() reusable") Fixes: ecf3e7030491 ("timekeeping: Provide adjtimex() for auxiliary clocks") Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260120-timekeeper-auxclock-leapstate-v1-1-5b358c6b3cfd@linutronix.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3ec3daa4acab..91fa2003351c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2735,7 +2735,7 @@ static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); result->clock_set = true; } else { - tk_update_leap_state_all(&tk_core); + tk_update_leap_state_all(tkd); } /* Update the multiplier immediately if frequency was set directly */ -- cgit v1.2.3 From c06343be0b4e03fe319910dd7a5d5b9929e1c0cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Dec 2025 18:21:05 +0100 Subject: clocksource: Reduce watchdog readout delay limit to prevent false positives The "valid" readout delay between the two reads of the watchdog is larger than the valid delta between the resulting watchdog and clocksource intervals, which leads to false positive watchdog results. Assume TSC is the clocksource and HPET is the watchdog and both have an uncertainty margin of 250us (default). The watchdog readout does: 1) wdnow = read(HPET); 2) csnow = read(TSC); 3) wdend = read(HPET); The valid window for the delta between #1 and #3 is calculated by the uncertainty margins of the watchdog and the clocksource: m = 2 * watchdog.uncertainty_margin + cs.uncertainty_margin; which results in 750us for the TSC/HPET case. The actual interval comparison uses a smaller margin: m = watchdog.uncertainty_margin + cs.uncertainty_margin; which results in 500us for the TSC/HPET case. That means the following scenario will trigger the watchdog: Watchdog cycle N: 1) wdnow[N] = read(HPET); 2) csnow[N] = read(TSC); 3) wdend[N] = read(HPET); Assume the delay between #1 and #2 is 100us and the overall delay between #1 and #3 stays within the 750us limit, so the readout is accepted. Watchdog cycle N + 1: 4) wdnow[N + 1] = read(HPET); 5) csnow[N + 1] = read(TSC); 6) wdend[N + 1] = read(HPET); If the delay between #4 and #6 is within the 750us margin then any delay between #4 and #5 which is larger than 600us will fail the interval check and mark the TSC unstable because the intervals are calculated against the previous value: wd_int = wdnow[N + 1] - wdnow[N]; cs_int = csnow[N + 1] - csnow[N]; Putting the above delays in place this results in: cs_int = (wdnow[N + 1] + 610us) - (wdnow[N] + 100us); -> cs_int = wd_int + 510us; which is obviously larger than the allowed 500us margin and results in marking TSC unstable. Fix this by using the same margin as the interval comparison. If the delay between two watchdog reads is larger than that, then the readout was either disturbed by interconnect congestion, NMIs or SMIs. Fixes: 4ac1dd3245b9 ("clocksource: Set cs_watchdog_read() checks based on .uncertainty_margin") Reported-by: Daniel J Blueman Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Tested-by: Paul E.
McKenney Link: https://lore.kernel.org/lkml/20250602223251.496591-1-daniel@quora.org/ Link: https://patch.msgid.link/87bjjxc9dq.ffs@tglx --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a1890a073196..df7194961658 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -252,7 +252,7 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { - int64_t md = 2 * watchdog->uncertainty_margin; + int64_t md = watchdog->uncertainty_margin; unsigned int nretries, max_retries; int64_t wd_delay, wd_seq_delay; u64 wd_end, wd_end2; -- cgit v1.2.3 From d06bf78e55d5159c1b00072e606ab924ffbbad35 Mon Sep 17 00:00:00 2001 From: Will Rosenberg Date: Mon, 19 Jan 2026 11:49:56 -0700 Subject: perf: Fix refcount warning on event->mmap_count increment When calling refcount_inc(&event->mmap_count) inside perf_mmap_rb(), the following warning is triggered: refcount_t: addition on 0; use-after-free. WARNING: lib/refcount.c:25 PoC: struct perf_event_attr attr = {0}; int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); mmap(NULL, 0x3000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); int victim = syscall(__NR_perf_event_open, &attr, 0, -1, fd, PERF_FLAG_FD_OUTPUT); mmap(NULL, 0x3000, PROT_READ | PROT_WRITE, MAP_SHARED, victim, 0); This occurs when creating a group member event with the flag PERF_FLAG_FD_OUTPUT. The group leader should be mmap-ed and then mmap-ing the event triggers the warning. Since the event has copied the output_event in perf_event_set_output(), event->rb is set. As a result, perf_mmap_rb() calls refcount_inc(&event->mmap_count) when event->mmap_count = 0. Disallow the case when event->mmap_count = 0. This also prevents two events from updating the same user_page. Fixes: 448f97fba901 ("perf: Convert mmap() refcounts to refcount_t") Suggested-by: Peter Zijlstra Signed-off-by: Will Rosenberg Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260119184956.801238-1-whrosenb@asu.edu --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b5cb620499e..a0fa488bce84 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6997,6 +6997,15 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; + /* + * If this event doesn't have mmap_count, we're attempting to + * create an alias of another event's mmap(); this would mean + * both events will end up scribbling the same user_page; + * which makes no sense. + */ + if (!refcount_read(&event->mmap_count)) + return -EBUSY; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer -- cgit v1.2.3 From 98c88dc8a1ace642d9021b103b28cba7b51e3abc Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 21 Jan 2026 17:33:17 +0100 Subject: sched/fair: Fix pelt clock sync when entering idle Samuel and Alex reported regressions of the util_avg of RT rq with commit 17e3e88ed0b6 ("sched/fair: Fix pelt lost idle time detection"). It happens that fair is updating and syncing the pelt clock with task one when pick_next_task_fair() fails to pick a task but before the prev scheduling class got a chance to update its pelt signals. 
Move update_idle_rq_clock_pelt() in set_next_task_idle() which is called after prev class has been called. Fixes: 17e3e88ed0b6 ("sched/fair: Fix pelt lost idle time detection") Closes: https://lore.kernel.org/all/CAG2KctpO6VKS6GN4QWDji0t92_gNBJ7HjjXrE+6H+RwRXt=iLg@mail.gmail.com/ Closes: https://lore.kernel.org/all/8cf19bf0e0054dcfed70e9935029201694f1bb5a.camel@mediatek.com/ Reported-by: Samuel Wu Reported-by: Alex Hoh Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Samuel Wu Tested-by: Alex Hoh Link: https://patch.msgid.link/20260121163317.505635-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 6 ------ kernel/sched/idle.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e71302282671..a148c61a8085 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8995,12 +8995,6 @@ idle: goto again; } - /* - * rq is about to be idle, check if we need to update the - * lost_idle_time of clock_pelt - */ - update_idle_rq_clock_pelt(rq); - return NULL; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c174afe1dd17..abf8f15d60c9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -468,6 +468,12 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); next->se.exec_start = rq_clock_task(rq); + + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); } struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) -- cgit v1.2.3 From 4f70f106bca1a56bd66d00830ac91680bd754974 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 20 Jan 2026 11:33:35 +0000 Subject: sched/fair: Disable scheduler feature NEXT_BUDDY NEXT_BUDDY was disabled with the introduction of EEVDF and enabled again after NEXT_BUDDY was rewritten for EEVDF by commit e837456fdca8 ("sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals"). It was not expected that this would be a universal win without a crystal ball instruction but the reported regressions are a concern [1][2] even if gains were also reported. Specifically; o mysql with client/server running on different servers regresses o specjbb reports lower peak metrics o daytrader regresses The mysql is realistic and a concern. It needs to be confirmed if specjbb is simply shifting the point where peak performance is measured but still a concern. daytrader is considered to be representative of a real workload. Access to test machines is currently problematic for verifying any fix to this problem. Disable NEXT_BUDDY for now by default until the root causes are addressed. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Tested-by: Madadi Vineeth Reddy Link: https://lore.kernel.org/lkml/4b96909a-f1ac-49eb-b814-97b8adda6229@arm.com [1] Link: https://lore.kernel.org/lkml/ec3ea66f-3a0d-4b5a-ab36-ce778f159b5b@linux.ibm.com [2] Link: https://patch.msgid.link/fyqsk63pkoxpeaclyqsm5nwtz3dyejplr7rg6p74xwemfzdzuu@7m7xhs5aqpqw --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 980d92bab8ab..136a6584be79 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
*/ -SCHED_FEAT(NEXT_BUDDY, true) +SCHED_FEAT(NEXT_BUDDY, false) /* * Allow completely ignoring cfs_rq->next; which can be set from various -- cgit v1.2.3 From 15257cc2f905dbf5813c0bfdd3c15885f28093c4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 23 Jan 2026 11:28:58 +0100 Subject: sched/fair: Revert force wakeup preemption This aggressively bypasses run_to_parity and slice protection with the assumption that this is what the waker wants, but there is no guarantee that the wakee will be the next to run. It is a better choice to use yield_to_task or WF_SYNC in such a case. This increases the number of reschedules and preemptions because a task quickly becomes "ineligible" once it runs; we update the task's vruntime periodically, before the task has exhausted its slice or even a minimum quantum. Example: 2 tasks A and B wake up simultaneously with lag = 0. Both are eligible. Task A runs 1st and wakes up task C. The scheduler updates task A's vruntime, which becomes greater than the average runtime as all others have a lag == 0 and didn't run yet. Now task A is ineligible because it received more runtime than the other tasks, but it has not yet exhausted its slice nor a min quantum. We force preemption and disable protection, but task B will run 1st, not task C. Sidenote: DELAY_ZERO increases this effect by clearing positive lag at wake up. Fixes: e837456fdca8 ("sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260123102858.52428-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a148c61a8085..3eaeceda71b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8828,16 +8828,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if ((wake_flags & WF_FORK) || pse->sched_delayed) return; - /* - * If @p potentially is completing work required by current then - * consider preemption. - * - * Reschedule if waker is no longer eligible. */ - if (in_task() && !entity_eligible(cfs_rq, se)) { - preempt_action = PREEMPT_WAKEUP_RESCHED; - goto preempt; - } - /* Prefer picking wakee soon if appropriate. */ if (sched_feat(NEXT_BUDDY) && set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { -- cgit v1.2.3 From 90f9f5d64cae4e72defd96a2a22760173cb3c9ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2026 19:48:24 -0500 Subject: tracing: Fix crash on synthetic stacktrace field usage When creating a synthetic event based on an existing synthetic event that had a stacktrace field and the new synthetic event used that field, a kernel crash occurred: ~# cd /sys/kernel/tracing ~# echo 's:stack unsigned long stack[];' > dynamic_events ~# echo 'hist:keys=prev_pid:s0=common_stacktrace if prev_state & 3' >> events/sched/sched_switch/trigger ~# echo 'hist:keys=next_pid:s1=$s0:onmatch(sched.sched_switch).trace(stack,$s1)' >> events/sched/sched_switch/trigger The above creates a synthetic event that takes a stacktrace when a task schedules out in a non-running state and passes that stacktrace to the sched_switch event when that task schedules back in. It triggers the "stack" synthetic event that has a stacktrace as its field (called "stack").
~# echo 's:syscall_stack s64 id; unsigned long stack[];' >> dynamic_events ~# echo 'hist:keys=common_pid:s2=stack' >> events/synthetic/stack/trigger ~# echo 'hist:keys=common_pid:s3=$s2,i0=id:onmatch(synthetic.stack).trace(syscall_stack,$i0,$s3)' >> events/raw_syscalls/sys_exit/trigger The above makes another synthetic event called "syscall_stack" that attaches the first synthetic event (stack) to the sys_exit trace event and records the stacktrace from the stack event with the id of the system call that is exiting. When enabling this event (or using it in a histogram): ~# echo 1 > events/synthetic/syscall_stack/enable Produces a kernel crash! BUG: unable to handle page fault for address: 0000000000400010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 6 UID: 0 PID: 1257 Comm: bash Not tainted 6.16.3+deb14-amd64 #1 PREEMPT(lazy) Debian 6.16.3-1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-debian-1.17.0-1 04/01/2014 RIP: 0010:trace_event_raw_event_synth+0x90/0x380 Code: c5 00 00 00 00 85 d2 0f 84 e1 00 00 00 31 db eb 34 0f 1f 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 <49> 8b 04 24 48 83 c3 01 8d 0c c5 08 00 00 00 01 cd 41 3b 5d 40 0f RSP: 0018:ffffd2670388f958 EFLAGS: 00010202 RAX: ffff8ba1065cc100 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: fffff266ffda7b90 RDI: ffffd2670388f9b0 RBP: 0000000000000010 R08: ffff8ba104e76000 R09: ffffd2670388fa50 R10: ffff8ba102dd42e0 R11: ffffffff9a908970 R12: 0000000000400010 R13: ffff8ba10a246400 R14: ffff8ba10a710220 R15: fffff266ffda7b90 FS: 00007fa3bc63f740(0000) GS:ffff8ba2e0f48000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000400010 CR3: 0000000107f9e003 CR4: 0000000000172ef0 Call Trace: ? __tracing_map_insert+0x208/0x3a0 action_trace+0x67/0x70 event_hist_trigger+0x633/0x6d0 event_triggers_call+0x82/0x130 trace_event_buffer_commit+0x19d/0x250 trace_event_raw_event_sys_exit+0x62/0xb0 syscall_exit_work+0x9d/0x140 do_syscall_64+0x20a/0x2f0 ? trace_event_raw_event_sched_switch+0x12b/0x170 ? save_fpregs_to_fpstate+0x3e/0x90 ? _raw_spin_unlock+0xe/0x30 ? finish_task_switch.isra.0+0x97/0x2c0 ? __rseq_handle_notify_resume+0xad/0x4c0 ? __schedule+0x4b8/0xd00 ? restore_fpregs_from_fpstate+0x3c/0x90 ? switch_fpu_return+0x5b/0xe0 ? do_syscall_64+0x1ef/0x2f0 ? do_fault+0x2e9/0x540 ? __handle_mm_fault+0x7d1/0xf70 ? count_memcg_events+0x167/0x1d0 ? handle_mm_fault+0x1d7/0x2e0 ? do_user_addr_fault+0x2c3/0x7f0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The reason is that the stacktrace field is not labeled as such; it is treated as a normal field rather than as the stacktrace that it is. In trace_event_raw_event_synth() the event's field is still treated as a dynamic array, but the retrieval of the data is handled as a normal field, and the reference is just the meta data: // Meta data is retrieved instead of a dynamic array str_val = (char *)(long)var_ref_vals[val_idx]; // Then when it tries to process it: len = *((unsigned long *)str_val) + 1; It triggers a kernel page fault. To fix this, first, when defining the fields of the first synthetic event, set the filter type to FILTER_STACKTRACE. This is used later by the second synthetic event to know that this field is a stacktrace. When creating the field of the new synthetic event, have it use this FILTER_STACKTRACE to know to create a stacktrace field to copy the stacktrace into.
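As a rough userspace model of the mismatch described above (all structures and names here are invented; it only shows why reading a scalar field value as a length-prefixed pointer must fault):

#include <stdio.h>

/* A variable's resolved value: either a plain scalar, or (for a
 * stacktrace) a pointer to a length-prefixed array of entries. */
union var_val {
	unsigned long scalar;
	unsigned long *stack;	/* stack[0] = number of entries */
};

static void record_stack(union var_val v)
{
	/* Correct handling for a field known to be a stacktrace. */
	unsigned long len = v.stack[0];
	printf("copying %lu stack entries\n", len);
}

static void record_scalar(union var_val v)
{
	printf("scalar value %#lx\n", v.scalar);
}

int main(void)
{
	unsigned long trace[4] = { 3, 0xa1, 0xa2, 0xa3 };
	union var_val stack_val = { .stack = trace };
	union var_val id_val    = { .scalar = 0x400010 };

	record_stack(stack_val);	/* field labeled as a stacktrace */
	record_scalar(id_val);		/* field labeled as a scalar     */

	/* The bug corresponds to calling record_stack() on a value that was
	 * never stored as a pointer: the "len = *(unsigned long *)str_val"
	 * step dereferences a bogus address such as 0x400010 and faults. */
	return 0;
}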
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Tom Zanussi Link: https://patch.msgid.link/20260122194824.6905a38e@gandalf.local.home Fixes: 00cf3d672a9d ("tracing: Allow synthetic events to pass around stacktraces") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 9 +++++++++ kernel/trace/trace_events_synth.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5e6e70540eef..c97bb2fda5c0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2057,6 +2057,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else hist_field->fn_num = HIST_FIELD_FN_PSTRING; + } else if (field->filter_type == FILTER_STACKTRACE) { + flags |= HIST_FIELD_FL_STACKTRACE; + + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + + hist_field->fn_num = HIST_FIELD_FN_STACK; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 4554c458b78c..45c187e77e21 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -130,7 +130,9 @@ static int synth_event_define_fields(struct trace_event_call *call) struct synth_event *event = call->data; unsigned int i, size, n_u64; char *name, *type; + int filter_type; bool is_signed; + bool is_stack; int ret = 0; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { @@ -138,8 +140,12 @@ static int synth_event_define_fields(struct trace_event_call *call) is_signed = event->fields[i]->is_signed; type = event->fields[i]->type; name = event->fields[i]->name; + is_stack = event->fields[i]->is_stack; + + filter_type = is_stack ? FILTER_STACKTRACE : FILTER_OTHER; + ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); + is_signed, filter_type); if (ret) break; -- cgit v1.2.3 From 00f13e28a9c3acd40f0551cde7e9d2d1a41585bf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Jan 2026 16:26:25 -0800 Subject: tracing: Avoid possible signed 64-bit truncation 64-bit truncation to 32-bit can result in the sign of the truncated value changing. The cmp_mod_entry is used in bsearch and so the truncation could result in an invalid search order. This would only happen were the addresses more than 2GB apart and so unlikely, but let's fix the potentially broken compare anyway. 
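The truncation hazard can be reproduced in a few lines of userspace C; the addresses below are made up, and the comparators only mirror the shape of the old and new cmp_mod_entry() logic:

#include <stdio.h>

struct trace_mod_entry { unsigned long mod_addr; };

/* Old shape: the unsigned long difference is implicitly truncated to
 * int, which can flip the sign when the addresses are > 2GB apart. */
static int cmp_old(unsigned long addr, const struct trace_mod_entry *ent)
{
	if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr)
		return 0;
	return addr - ent->mod_addr;	/* implicit 64-bit -> int truncation */
}

/* New shape: only ever returns a correctly signed result. */
static int cmp_new(unsigned long addr, const struct trace_mod_entry *ent)
{
	if (addr < ent[0].mod_addr)
		return -1;
	return addr >= ent[1].mod_addr;
}

int main(void)
{
	/* Hypothetical module range starting more than 2GB above the key. */
	struct trace_mod_entry ents[2] = { { 0x1a0000000UL }, { 0x1b0000000UL } };
	unsigned long key = 0x100000000UL;	/* below the range */

	/* The key is below the range, so a bsearch comparator must be < 0. */
	printf("old: %d (sign flipped by truncation)\n", cmp_old(key, ents));
	printf("new: %d\n", cmp_new(key, ents));
	return 0;
}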
Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260108002625.333331-1-irogers@google.com Signed-off-by: Ian Rogers Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index baec63134ab6..8bd4ec08fb36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6115,10 +6115,10 @@ static int cmp_mod_entry(const void *key, const void *pivot) unsigned long addr = (unsigned long)key; const struct trace_mod_entry *ent = pivot; - if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) - return 0; - else - return addr - ent->mod_addr; + if (addr < ent[0].mod_addr) + return -1; + + return addr >= ent[1].mod_addr; } /** -- cgit v1.2.3 From c9703d17d2c86eda38fe4917ca70c27ec9dbe162 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Mon, 12 Jan 2026 10:16:01 +0800 Subject: function_graph: Fix args pointer mismatch in print_graph_retval() When funcgraph-args and funcgraph-retaddr are both enabled, many kernel functions display invalid parameters in trace logs. The issue occurs because print_graph_retval() passes a mismatched args pointer to print_function_args(). Fix this by retrieving the correct args pointer using the FGRAPH_ENTRY_ARGS() macro. Link: https://patch.msgid.link/20260112021601.1300479-1-dolinux.peng@gmail.com Fixes: f83ac7544fbf ("function_graph: Enable funcgraph-args and funcgraph-retaddr to work simultaneously") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Donglin Peng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1e9c9913309..1de6f1573621 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -901,7 +901,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr trace_seq_printf(s, "%ps", func); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { - print_function_args(s, entry->args, (unsigned long)func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), (unsigned long)func); trace_seq_putc(s, ';'); } else trace_seq_puts(s, "();"); -- cgit v1.2.3