From 65f97cc81b0adc5f49cf6cff5d874be0058e3f41 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2025 13:24:28 -0400 Subject: cgroup/cpuset: Use static_branch_enable_cpuslocked() on cpusets_insane_config_key The following lockdep splat was observed. [ 812.359086] ============================================ [ 812.359089] WARNING: possible recursive locking detected [ 812.359097] -------------------------------------------- [ 812.359100] runtest.sh/30042 is trying to acquire lock: [ 812.359105] ffffffffa7f27420 (cpu_hotplug_lock){++++}-{0:0}, at: static_key_enable+0xe/0x20 [ 812.359131] [ 812.359131] but task is already holding lock: [ 812.359134] ffffffffa7f27420 (cpu_hotplug_lock){++++}-{0:0}, at: cpuset_write_resmask+0x98/0xa70 : [ 812.359267] Call Trace: [ 812.359272] [ 812.359367] cpus_read_lock+0x3c/0xe0 [ 812.359382] static_key_enable+0xe/0x20 [ 812.359389] check_insane_mems_config.part.0+0x11/0x30 [ 812.359398] cpuset_write_resmask+0x9f2/0xa70 [ 812.359411] cgroup_file_write+0x1c7/0x660 [ 812.359467] kernfs_fop_write_iter+0x358/0x530 [ 812.359479] vfs_write+0xabe/0x1250 [ 812.359529] ksys_write+0xf9/0x1d0 [ 812.359558] do_syscall_64+0x5f/0xe0 Since commit d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order"), the ordering of cpu hotplug lock and cpuset_mutex had been reversed. That patch correctly used the cpuslocked version of the static branch API to enable cpusets_pre_enable_key and cpusets_enabled_key, but it didn't do the same for cpusets_insane_config_key. The cpusets_insane_config_key can be enabled in the check_insane_mems_config() which is called from update_nodemask() or cpuset_hotplug_update_tasks() with both cpu hotplug lock and cpuset_mutex held. Deadlock can happen with a pending hotplug event that tries to acquire the cpu hotplug write lock which will block further cpus_read_lock() attempt from check_insane_mems_config(). Fix that by switching to use static_branch_enable_cpuslocked(). Fixes: d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order") Signed-off-by: Waiman Long Reviewed-by: Juri Lelli Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f74d04429a29..bf149246e001 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -280,7 +280,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { - static_branch_enable(&cpusets_insane_config_key); + static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); -- cgit v1.2.3 From 150e298ae0ccbecff2357a72fbabd80f8849ea6e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2025 13:24:29 -0400 Subject: cgroup/cpuset: Fix a partition error with CPU hotplug It was found during testing that an invalid leaf partition with an empty effective exclusive CPU list can become a valid empty partition with no CPU afer an offline/online operation of an unrelated CPU. An empty partition root is allowed in the special case that it has no task in its cgroup and has distributed out all its CPUs to its child partitions. That is certainly not the case here. The problem is in the cpumask_subsets() test in the hotplug case (update with no new mask) of update_parent_effective_cpumask() as it also returns true if the effective exclusive CPU list is empty. Fix that by addding the cpumask_empty() test to root out this exception case. Also add the cpumask_empty() test in cpuset_hotplug_update_tasks() to avoid calling update_parent_effective_cpumask() for this special case. Fixes: 0c7f293efc87 ("cgroup/cpuset: Add cpuset.cpus.exclusive.effective for v2") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bf149246e001..d993e058a663 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1843,7 +1843,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - } else if (is_partition_invalid(cs) && + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; @@ -3870,9 +3870,10 @@ retry: partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned - * back to a regular one. + * back to a regular one with a non-empty effective xcpus. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { -- cgit v1.2.3 From 87eba5bc5ab1d99e31c9d3b2c386187da94a5ab1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2025 13:24:30 -0400 Subject: cgroup/cpuset: Remove the unnecessary css_get/put() in cpuset_partition_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The css_get/put() calls in cpuset_partition_write() are unnecessary as an active reference of the kernfs node will be taken which will prevent its removal and guarantee the existence of the css. Only the online check is needed. Signed-off-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d993e058a663..27adb04df675 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3358,14 +3358,12 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); - css_put(&cs->css); return retval ?: nbytes; } -- cgit v1.2.3 From eea51c6e3f6675b795f6439eaa960eb2948d6905 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 6 Aug 2025 17:33:50 -0700 Subject: cgroup: avoid null de-ref in css_rstat_exit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit css_rstat_exit() may be called asynchronously in scenarios where preceding calls to css_rstat_init() have not completed. One such example is this sequence below: css_create(...) { ... init_and_link_css(css, ...); err = percpu_ref_init(...); if (err) goto err_free_css; err = cgroup_idr_alloc(...); if (err) goto err_free_css; err = css_rstat_init(css, ...); if (err) goto err_free_css; ... err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } If any of the three goto jumps are taken, async cleanup will begin and css_rstat_exit() will be invoked on an uninitialized css->rstat_cpu. Avoid accessing the unitialized field by returning early in css_rstat_exit() if this is the case. Signed-off-by: JP Kobryn Suggested-by: Michal Koutný Fixes: 5da3bfa029d68 ("cgroup: use separate rstat trees for each subsystem") Cc: stable@vger.kernel.org # v6.16 Reported-by: syzbot+8d052e8b99e40bc625ed@syzkaller.appspotmail.com Acked-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 981e2f77ad4e..a198e40c799b 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -479,6 +479,9 @@ void css_rstat_exit(struct cgroup_subsys_state *css) if (!css_uses_rstat(css)) return; + if (!css->rstat_cpu) + return; + css_rstat_flush(css); /* sanity check */ -- cgit v1.2.3 From 61399e0c5410567ef60cb1cda34cca42903842e3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Aug 2025 19:03:22 +0200 Subject: rcu: Fix racy re-initialization of irq_work causing hangs RCU re-initializes the deferred QS irq work everytime before attempting to queue it. However there are situations where the irq work is attempted to be queued even though it is already queued. In that case re-initializing messes-up with the irq work queue that is about to be handled. The chances for that to happen are higher when the architecture doesn't support self-IPIs and irq work are then all lazy, such as with the following sequence: 1) rcu_read_unlock() is called when IRQs are disabled and there is a grace period involving blocked tasks on the node. The irq work is then initialized and queued. 2) The related tasks are unblocked and the CPU quiescent state is reported. rdp->defer_qs_iw_pending is reset to DEFER_QS_IDLE, allowing the irq work to be requeued in the future (note the previous one hasn't fired yet). 3) A new grace period starts and the node has blocked tasks. 4) rcu_read_unlock() is called when IRQs are disabled again. The irq work is re-initialized (but it's queued! and its node is cleared) and requeued. Which means it's requeued to itself. 5) The irq work finally fires with the tick. But since it was requeued to itself, it loops and hangs. Fix this with initializing the irq work only once before the CPU boots. Fixes: b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202508071303.c1134cce-lkp@intel.com Signed-off-by: Frederic Weisbecker Reviewed-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 8 ++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 174ee243b349..8eff357b0436 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4262,6 +4262,8 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcu_preempt_deferred_qs_init(rdp); rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index de6ca13a7b5f..b8bbe7960cda 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fc14adf15cbb..4cd170b2d655 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -763,8 +763,6 @@ static void rcu_read_unlock_special(struct task_struct *t) cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - rdp->defer_qs_iw = - IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -904,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* @@ -1103,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { } + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* -- cgit v1.2.3 From dfb36e4a8db0cd56f92d4cb445f54e85a9b40897 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 11 Aug 2025 10:11:47 -0400 Subject: futex: Use user_write_access_begin/_end() in futex_put_value() Commit cec199c5e39b ("futex: Implement FUTEX2_NUMA") introduced the futex_put_value() helper to write a value to the given user address. However, it uses user_read_access_begin() before the write. For architectures that differentiate between read and write accesses, like PowerPC, futex_put_value() fails with -EFAULT. Fix that by using the user_write_access_begin/user_write_access_end() pair instead. Fixes: cec199c5e39b ("futex: Implement FUTEX2_NUMA") Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250811141147.322261-1-longman@redhat.com --- kernel/futex/futex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c74eac572acd..2cd57096c38e 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to) { if (can_do_masked_user_access()) to = masked_user_access_begin(to); - else if (!user_read_access_begin(to, sizeof(*to))) + else if (!user_write_access_begin(to, sizeof(*to))) return -EFAULT; unsafe_put_user(val, to, Efault); - user_read_access_end(); + user_write_access_end(); return 0; Efault: - user_read_access_end(); + user_write_access_end(); return -EFAULT; } -- cgit v1.2.3 From ddf7233fcab6c247379d0928d46cc316ee122229 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 5 Aug 2025 10:59:11 +0200 Subject: sched/ext: Fix invalid task state transitions on class switch When enabling a sched_ext scheduler, we may trigger invalid task state transitions, resulting in warnings like the following (which can be easily reproduced by running the hotplug selftest in a loop): sched_ext: Invalid task state transition 0 -> 3 for fish[770] WARNING: CPU: 18 PID: 787 at kernel/sched/ext.c:3862 scx_set_task_state+0x7c/0xc0 ... RIP: 0010:scx_set_task_state+0x7c/0xc0 ... Call Trace: scx_enable_task+0x11f/0x2e0 switching_to_scx+0x24/0x110 scx_enable.isra.0+0xd14/0x13d0 bpf_struct_ops_link_create+0x136/0x1a0 __sys_bpf+0x1edd/0x2c30 __x64_sys_bpf+0x21/0x30 do_syscall_64+0xbb/0x370 entry_SYSCALL_64_after_hwframe+0x77/0x7f This happens because we skip initialization for tasks that are already dead (with their usage counter set to zero), but we don't exclude them during the scheduling class transition phase. Fix this by also skipping dead tasks during class swiching, preventing invalid task state transitions. Fixes: a8532fac7b5d2 ("sched_ext: TASK_DEAD tasks must be switched into SCX on ops_enable") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7dedc9a16281..4ae32ef179dd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5749,6 +5749,9 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (!tryget_task_struct(p)) + continue; + if (old_class != new_class && p->se.sched_delayed) dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -5761,6 +5764,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); + put_task_struct(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); -- cgit v1.2.3 From c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Jul 2025 14:26:11 +0200 Subject: ipvs: Fix estimator kthreads preferred affinity The estimator kthreads' affinity are defined by sysctl overwritten preferences and applied through a plain call to the scheduler's affinity API. However since the introduction of managed kthreads preferred affinity, such a practice shortcuts the kthreads core code which eventually overwrites the target to the default unbound affinity. Fix this with using the appropriate kthread's API. Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node") Signed-off-by: Frederic Weisbecker Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/net/ip_vs.h | 13 +++++++++++++ kernel/kthread.c | 1 + net/netfilter/ipvs/ip_vs_est.c | 3 ++- 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff406ef4fd4a..29a36709e7f3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; @@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; diff --git a/kernel/kthread.c b/kernel/kthread.c index 0e98b228a8ef..31b072e8d427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -893,6 +893,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index f821ad2e19b3..15049b826732 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, } set_user_nice(kd->task, sysctl_est_nice(ipvs)); - set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); + if (sysctl_est_preferred_cpulist(ipvs)) + kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); -- cgit v1.2.3 From 21924af67d69d7c9fdaf845be69043cfe75196a1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 5 Aug 2025 00:10:02 +0000 Subject: locking: Fix __clear_task_blocked_on() warning from __ww_mutex_wound() path The __clear_task_blocked_on() helper added a number of sanity checks ensuring we hold the mutex wait lock and that the task we are clearing blocked_on pointer (if set) matches the mutex. However, there is an edge case in the _ww_mutex_wound() logic where we need to clear the blocked_on pointer for the task that owns the mutex, not the task that is waiting on the mutex. For this case the sanity checks aren't valid, so handle this by allowing a NULL lock to skip the additional checks. K Prateek Nayak and Maarten Lankhorst also pointed out that in this case where we don't hold the owner's mutex wait_lock, we need to be a bit more careful using READ_ONCE/WRITE_ONCE in both the __clear_task_blocked_on() and __set_task_blocked_on() implementations to avoid accidentally tripping WARN_ONs if two instances race. So do that here as well. This issue was easier to miss, I realized, as the test-ww_mutex driver only exercises the wait-die class of ww_mutexes. I've sent a patch[1] to address this so the logic will be easier to test. [1]: https://lore.kernel.org/lkml/20250801023358.562525-2-jstultz@google.com/ Fixes: a4f0b6fef4b0 ("locking/mutex: Add p->blocked_on wrappers for correctness checks") Closes: https://lore.kernel.org/lkml/68894443.a00a0220.26d0e1.0015.GAE@google.com/ Reported-by: syzbot+602c4720aed62576cd79@syzkaller.appspotmail.com Reported-by: Maarten Lankhorst Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Acked-by: Maarten Lankhorst Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250805001026.2247040-1-jstultz@google.com --- include/linux/sched.h | 29 +++++++++++++++++------------ kernel/locking/ww_mutex.h | 6 +++++- 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 40d2fa90df42..62103dd6a48e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2144,6 +2144,8 @@ static inline struct mutex *__get_task_blocked_on(struct task_struct *p) static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); @@ -2154,8 +2156,8 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ - WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); - p->blocked_on = m; + WARN_ON_ONCE(blocked_on && blocked_on != m); + WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2166,16 +2168,19 @@ static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - WARN_ON_ONCE(!m); - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); - /* - * There may be cases where we re-clear already cleared - * blocked_on relationships, but make sure we are not - * clearing the relationship with a different lock. - */ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); - p->blocked_on = NULL; + if (m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + } + WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 086fd5487ca7..31a785afee6c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * When waking up the task to wound, be sure to clear the * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. */ - __clear_task_blocked_on(owner, lock); + __clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; -- cgit v1.2.3 From b64fdd422a85025b5e91ead794db9d3ef970e369 Mon Sep 17 00:00:00 2001 From: Yunseong Kim Date: Tue, 12 Aug 2025 18:10:47 +0000 Subject: perf: Avoid undefined behavior from stopping/starting inactive events Calling pmu->start()/stop() on perf events in PERF_EVENT_STATE_OFF can leave event->hw.idx at -1. When PMU drivers later attempt to use this negative index as a shift exponent in bitwise operations, it leads to UBSAN shift-out-of-bounds reports. The issue is a logical flaw in how event groups handle throttling when some members are intentionally disabled. Based on the analysis and the reproducer provided by Mark Rutland (this issue on both arm64 and x86-64). The scenario unfolds as follows: 1. A group leader event is configured with a very aggressive sampling period (e.g., sample_period = 1). This causes frequent interrupts and triggers the throttling mechanism. 2. A child event in the same group is created in a disabled state (.disabled = 1). This event remains in PERF_EVENT_STATE_OFF. Since it hasn't been scheduled onto the PMU, its event->hw.idx remains initialized at -1. 3. When throttling occurs, perf_event_throttle_group() and later perf_event_unthrottle_group() iterate through all siblings, including the disabled child event. 4. perf_event_throttle()/unthrottle() are called on this inactive child event, which then call event->pmu->start()/stop(). 5. The PMU driver receives the event with hw.idx == -1 and attempts to use it as a shift exponent. e.g., in macros like PMCNTENSET(idx), leading to the UBSAN report. The throttling mechanism attempts to start/stop events that are not actively scheduled on the hardware. Move the state check into perf_event_throttle()/perf_event_unthrottle() so that inactive events are skipped entirely. This ensures only active events with a valid hw.idx are processed, preventing undefined behavior and silencing UBSAN warnings. The corrected check ensures true before proceeding with PMU operations. The problem can be reproduced with the syzkaller reproducer: Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group") Signed-off-by: Yunseong Kim Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250812181046.292382-2-ysk@kzalloc.com --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8060c2857bb2..872122e074e5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2665,6 +2665,9 @@ static void perf_log_itrace_start(struct perf_event *event); static void perf_event_unthrottle(struct perf_event *event, bool start) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + event->hw.interrupts = 0; if (start) event->pmu->start(event, 0); @@ -2674,6 +2677,9 @@ static void perf_event_unthrottle(struct perf_event *event, bool start) static void perf_event_throttle(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + event->hw.interrupts = MAX_INTERRUPTS; event->pmu->stop(event, 0); if (event == event->group_leader) -- cgit v1.2.3 From 5eb4b9a4cdbb70d70377fe8fb2920b75910e5024 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 13 Aug 2025 15:21:59 +0200 Subject: params: Replace deprecated strcpy() with strscpy() and memcpy() strcpy() is deprecated; use strscpy() and memcpy() instead. In param_set_copystring(), we can safely use memcpy() because we already know the length of the source string 'val' and that it is guaranteed to be NUL-terminated within the first 'kps->maxlen' bytes. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20250813132200.184064-2-thorsten.blum@linux.dev Signed-off-by: Daniel Gomez --- kernel/params.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b92d64161b75..b96cfd693c99 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -513,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; + const size_t len = strnlen(val, kps->maxlen); - if (strnlen(val, kps->maxlen) == kps->maxlen) { + if (len == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } - strcpy(kps->string, val); + memcpy(kps->string, val, len + 1); return 0; } EXPORT_SYMBOL(param_set_copystring); @@ -841,7 +842,7 @@ static void __init param_sysfs_builtin(void) dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ - strcpy(modname, "kernel"); + strscpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; -- cgit v1.2.3 From a2c1f82618b0b65f1ef615aa9cfdac8122537d69 Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Mon, 18 Aug 2025 21:43:10 +0800 Subject: signal: Fix memory leak for PIDFD_SELF* sentinels Commit f08d0c3a7111 ("pidfd: add PIDFD_SELF* sentinels to refer to own thread/process") introduced a leak by acquiring a pid reference through get_task_pid(), which increments pid->count but never drops it with put_pid(). As a result, kmemleak reports unreferenced pid objects after running tools/testing/selftests/pidfd/pidfd_test, for example: unreferenced object 0xff1100206757a940 (size 160): comm "pidfd_test", pid 16965, jiffies 4294853028 hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 fd 57 50 04 .............WP. 5e 44 00 00 00 00 00 00 18 de 34 17 01 00 11 ff ^D........4..... backtrace (crc cd8844d4): kmem_cache_alloc_noprof+0x2f4/0x3f0 alloc_pid+0x54/0x3d0 copy_process+0xd58/0x1740 kernel_clone+0x99/0x3b0 __do_sys_clone3+0xbe/0x100 do_syscall_64+0x7b/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix this by calling put_pid() after do_pidfd_send_signal() returns. Fixes: f08d0c3a7111 ("pidfd: add PIDFD_SELF* sentinels to refer to own thread/process") Signed-off-by: Adrian Huang (Lenovo) Link: https://lore.kernel.org/20250818134310.12273-1-adrianhuang0701@gmail.com Tested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Signed-off-by: Christian Brauner --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e2c928de7d2c..fe9190d84f28 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4067,6 +4067,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, { struct pid *pid; enum pid_type type; + int ret; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) @@ -4108,7 +4109,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, } } - return do_pidfd_send_signal(pid, sig, type, info, flags); + ret = do_pidfd_send_signal(pid, sig, type, info, flags); + put_pid(pid); + + return ret; } static int -- cgit v1.2.3 From 63b17b653df30e90f95338083cb44c35d64bcae4 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 8 Aug 2025 20:18:02 +0000 Subject: kho: init new_physxa->phys_bits to fix lockdep Patch series "Several KHO Hotfixes". Three unrelated fixes for Kexec Handover. This patch (of 3): Lockdep shows the following warning: INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. [] dump_stack_lvl+0x66/0xa0 [] assign_lock_key+0x10c/0x120 [] register_lock_class+0xf4/0x2f0 [] __lock_acquire+0x7f/0x2c40 [] ? __pfx_hlock_conflict+0x10/0x10 [] ? native_flush_tlb_global+0x8e/0xa0 [] ? __flush_tlb_all+0x4e/0xa0 [] ? __kernel_map_pages+0x112/0x140 [] ? xa_load_or_alloc+0x67/0xe0 [] lock_acquire+0xe6/0x280 [] ? xa_load_or_alloc+0x67/0xe0 [] _raw_spin_lock+0x30/0x40 [] ? xa_load_or_alloc+0x67/0xe0 [] xa_load_or_alloc+0x67/0xe0 [] kho_preserve_folio+0x90/0x100 [] __kho_finalize+0xcf/0x400 [] kho_finalize+0x34/0x70 This is becase xa has its own lock, that is not initialized in xa_load_or_alloc. Modifiy __kho_preserve_order(), to properly call xa_init(&new_physxa->phys_bits); Link: https://lkml.kernel.org/r/20250808201804.772010-2-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Changyuan Lyu Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Cc: Pratyush Yadav Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index e49743ae52c5..65145972d6d6 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -144,14 +144,34 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, unsigned int order) { struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa; + struct kho_mem_phys *physxa, *new_physxa; const unsigned long pfn_high = pfn >> order; might_sleep(); - physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa)); - if (IS_ERR(physxa)) - return PTR_ERR(physxa); + physxa = xa_load(&track->orders, order); + if (!physxa) { + int err; + + new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL); + if (!new_physxa) + return -ENOMEM; + + xa_init(&new_physxa->phys_bits); + physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa, + GFP_KERNEL); + + err = xa_err(physxa); + if (err || physxa) { + xa_destroy(&new_physxa->phys_bits); + kfree(new_physxa); + + if (err) + return err; + } else { + physxa = new_physxa; + } + } bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, sizeof(*bits)); -- cgit v1.2.3 From 8b66ed2c3f42cc462e05704af6b94e6a7bad2f5e Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 8 Aug 2025 20:18:03 +0000 Subject: kho: mm: don't allow deferred struct page with KHO KHO uses struct pages for the preserved memory early in boot, however, with deferred struct page initialization, only a small portion of memory has properly initialized struct pages. This problem was detected where vmemmap is poisoned, and illegal flag combinations are detected. Don't allow them to be enabled together, and later we will have to teach KHO to work properly with deferred struct page init kernel feature. Link: https://lkml.kernel.org/r/20250808201804.772010-3-pasha.tatashin@soleen.com Fixes: 4e1d010e3bda ("kexec: add config option for KHO") Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (Microsoft) Acked-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Changyuan Lyu Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 2ee603a98813..1224dd937df0 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,6 +97,7 @@ config KEXEC_JUMP config KEXEC_HANDOVER bool "kexec handover" depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT select MEMBLOCK_KHO_SCRATCH select KEXEC_FILE select DEBUG_FS -- cgit v1.2.3 From 44958f2025ed3f29fc3e93bb1f6c16121d7847ad Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 8 Aug 2025 20:18:04 +0000 Subject: kho: warn if KHO is disabled due to an error During boot scratch area is allocated based on command line parameters or auto calculated. However, scratch area may fail to allocate, and in that case KHO is disabled. Currently, no warning is printed that KHO is disabled, which makes it confusing for the end user to figure out why KHO is not available. Add the missing warning message. Link: https://lkml.kernel.org/r/20250808201804.772010-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (Microsoft) Acked-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Changyuan Lyu Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 65145972d6d6..ecd1ac210dbd 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -564,6 +564,7 @@ err_free_scratch_areas: err_free_scratch_desc: memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); err_disable_kho: + pr_warn("Failed to reserve scratch area, disabling kexec handover\n"); kho_enable = false; } -- cgit v1.2.3 From 6a909ea83f226803ea0e718f6e88613df9234d58 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 13 Aug 2025 04:02:32 +0000 Subject: tracing: Limit access to parser->buffer when trace_get_user failed When the length of the string written to set_ftrace_filter exceeds FTRACE_BUFF_MAX, the following KASAN alarm will be triggered: BUG: KASAN: slab-out-of-bounds in strsep+0x18c/0x1b0 Read of size 1 at addr ffff0000d00bd5ba by task ash/165 CPU: 1 UID: 0 PID: 165 Comm: ash Not tainted 6.16.0-g6bcdbd62bd56-dirty Hardware name: linux,dummy-virt (DT) Call trace: show_stack+0x34/0x50 (C) dump_stack_lvl+0xa0/0x158 print_address_description.constprop.0+0x88/0x398 print_report+0xb0/0x280 kasan_report+0xa4/0xf0 __asan_report_load1_noabort+0x20/0x30 strsep+0x18c/0x1b0 ftrace_process_regex.isra.0+0x100/0x2d8 ftrace_regex_release+0x484/0x618 __fput+0x364/0xa58 ____fput+0x28/0x40 task_work_run+0x154/0x278 do_notify_resume+0x1f0/0x220 el0_svc+0xec/0xf0 el0t_64_sync_handler+0xa0/0xe8 el0t_64_sync+0x1ac/0x1b0 The reason is that trace_get_user will fail when processing a string longer than FTRACE_BUFF_MAX, but not set the end of parser->buffer to 0. Then an OOB access will be triggered in ftrace_regex_release-> ftrace_process_regex->strsep->strpbrk. We can solve this problem by limiting access to parser->buffer when trace_get_user failed. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250813040232.1344527-1-pulehui@huaweicloud.com Fixes: 8c9af478c06b ("ftrace: Handle commands when closing set_ftrace_filter file") Signed-off-by: Pu Lehui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 18 ++++++++++++------ kernel/trace/trace.h | 8 +++++++- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4283ed4e8f59..8d8935ed416d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1816,7 +1816,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - return ret; + goto fail; read++; cnt--; @@ -1830,7 +1830,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - return ret; + goto fail; read++; cnt--; } @@ -1848,12 +1848,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; - else - return -EINVAL; + else { + ret = -EINVAL; + goto fail; + } ret = get_user(ch, ubuf++); if (ret) - return ret; + goto fail; read++; cnt--; } @@ -1868,11 +1870,15 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { - return -EINVAL; + ret = -EINVAL; + goto fail; } *ppos += read; return read; +fail: + trace_parser_fail(parser); + return ret; } /* TODO add a seq_buf_to_buffer() */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dbf1d3cf2f1..be6654899cae 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1292,6 +1292,7 @@ bool ftrace_event_is_function(struct trace_event_call *call); */ struct trace_parser { bool cont; + bool fail; char *buffer; unsigned idx; unsigned size; @@ -1299,7 +1300,7 @@ struct trace_parser { static inline bool trace_parser_loaded(struct trace_parser *parser) { - return (parser->idx != 0); + return !parser->fail && parser->idx != 0; } static inline bool trace_parser_cont(struct trace_parser *parser) @@ -1313,6 +1314,11 @@ static inline void trace_parser_clear(struct trace_parser *parser) parser->idx = 0; } +static inline void trace_parser_fail(struct trace_parser *parser) +{ + parser->fail = true; +} + extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, -- cgit v1.2.3 From cd6e4faba96fe41d6b686e144b96dad5e6f2e771 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Wed, 13 Aug 2025 17:51:14 +0800 Subject: ring-buffer: Remove redundant semicolons Remove unnecessary semicolons. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250813095114.559530-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb71a0dc9d69..43460949ad3f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7666,7 +7666,7 @@ static __init int test_ringbuffer(void) rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); - /* Just run for 10 seconds */; + /* Just run for 10 seconds */ schedule_timeout(10 * HZ); kthread_stop(rb_hammer); -- cgit v1.2.3 From edede7a6dcd7435395cf757d053974aaab6ab1c2 Mon Sep 17 00:00:00 2001 From: Ye Weihua Date: Mon, 18 Aug 2025 07:33:32 +0000 Subject: trace/fgraph: Fix the warning caused by missing unregister notifier This warning was triggered during testing on v6.16: notifier callback ftrace_suspend_notifier_call already registered WARNING: CPU: 2 PID: 86 at kernel/notifier.c:23 notifier_chain_register+0x44/0xb0 ... Call Trace: blocking_notifier_chain_register+0x34/0x60 register_ftrace_graph+0x330/0x410 ftrace_profile_write+0x1e9/0x340 vfs_write+0xf8/0x420 ? filp_flush+0x8a/0xa0 ? filp_close+0x1f/0x30 ? do_dup2+0xaf/0x160 ksys_write+0x65/0xe0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f When writing to the function_profile_enabled interface, the notifier was not unregistered after start_graph_tracing failed, causing a warning the next time function_profile_enabled was written. Fixed by adding unregister_pm_notifier in the exception path. Link: https://lore.kernel.org/20250818073332.3890629-1-yeweihua4@huawei.com Fixes: 4a2b8dda3f870 ("tracing/function-graph-tracer: fix a regression while suspend to disk") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Ye Weihua Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index f4d200f0c610..2a42c1036ea8 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1397,6 +1397,7 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } -- cgit v1.2.3 From ec879e1a0be8007aa232ffedcf6a6445dfc1a3d7 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 16 Aug 2025 23:10:51 +0900 Subject: tracing: fprobe-event: Sanitize wildcard for fprobe event name Fprobe event accepts wildcards for the target functions, but unless user specifies its event name, it makes an event with the wildcards. /sys/kernel/tracing # echo 'f mutex*' >> dynamic_events /sys/kernel/tracing # cat dynamic_events f:fprobes/mutex*__entry mutex* /sys/kernel/tracing # ls events/fprobes/ enable filter mutex*__entry To fix this, replace the wildcard ('*') with an underscore. Link: https://lore.kernel.org/all/175535345114.282990.12294108192847938710.stgit@devnote2/ Fixes: 334e5519c375 ("tracing/probes: Add fprobe events for tracing function entry and exit.") Signed-off-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dbf1d3cf2f1..5a6688832da8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2204,7 +2204,7 @@ static inline bool is_good_system_name(const char *name) static inline void sanitize_event_name(char *name) { while (*name++ != '\0') - if (*name == ':' || *name == '.') + if (*name == ':' || *name == '.' || *name == '*') *name = '_'; } -- cgit v1.2.3 From e3d01979e4bff5c87eb4054a22e7568bb679b1fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Aug 2025 19:55:22 -0400 Subject: fgraph: Copy args in intermediate storage with entry The output of the function graph tracer has two ways to display its entries. One way for leaf functions with no events recorded within them, and the other is for functions with events recorded inside it. As function graph has an entry and exit event, to simplify the output of leaf functions it combines the two, where as non leaf functions are separate: 2) | invoke_rcu_core() { 2) | raise_softirq() { 2) 0.391 us | __raise_softirq_irqoff(); 2) 1.191 us | } 2) 2.086 us | } The __raise_softirq_irqoff() function above is really two events that were merged into one. Otherwise it would have looked like: 2) | invoke_rcu_core() { 2) | raise_softirq() { 2) | __raise_softirq_irqoff() { 2) 0.391 us | } 2) 1.191 us | } 2) 2.086 us | } In order to do this merge, the reading of the trace output file needs to look at the next event before printing. But since the pointer to the event is on the ring buffer, it needs to save the entry event before it looks at the next event as the next event goes out of focus as soon as a new event is read from the ring buffer. After it reads the next event, it will print the entry event with either the '{' (non leaf) or ';' and timestamps (leaf). The iterator used to read the trace file has storage for this event. The problem happens when the function graph tracer has arguments attached to the entry event as the entry now has a variable length "args" field. This field only gets set when funcargs option is used. But the args are not recorded in this temp data and garbage could be printed. The entry field is copied via: data->ent = *curr; Where "curr" is the entry field. But this method only saves the non variable length fields from the structure. Add a helper structure to the iterator data that adds the max args size to the data storage in the iterator. Then simply copy the entire entry into this storage (with size protection). Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250820195522.51d4a268@gandalf.local.home Reported-by: Sasha Levin Tested-by: Sasha Levin Closes: https://lore.kernel.org/all/aJaxRVKverIjF4a6@lappy/ Fixes: ff5c9c576e75 ("ftrace: Add support for function argument to graph tracer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 66e1a527cf1a..a7f4b9a47a71 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -27,14 +27,21 @@ struct fgraph_cpu_data { unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; +struct fgraph_ent_args { + struct ftrace_graph_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ union { - struct ftrace_graph_ent_entry ent; + struct fgraph_ent_args ent; + /* TODO allow retaddr to have args */ struct fgraph_retaddr_ent_entry rent; - } ent; + }; struct ftrace_graph_ret_entry ret; int failed; int cpu; @@ -627,10 +634,13 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) - data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr; - else - data->ent.ent = *curr; + if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) { + data->rent = *(struct fgraph_retaddr_ent_entry *)curr; + } else { + int size = min((int)sizeof(data->ent), (int)iter->ent_size); + + memcpy(&data->ent, curr, size); + } /* * If the next event is not a return type, then * we only care about what type it is. Otherwise we can -- cgit v1.2.3 From 4013aef2ced9b756a410f50d12df9ebe6a883e4a Mon Sep 17 00:00:00 2001 From: Tengda Wu Date: Fri, 22 Aug 2025 03:33:43 +0000 Subject: ftrace: Fix potential warning in trace_printk_seq during ftrace_dump When calling ftrace_dump_one() concurrently with reading trace_pipe, a WARN_ON_ONCE() in trace_printk_seq() can be triggered due to a race condition. The issue occurs because: CPU0 (ftrace_dump) CPU1 (reader) echo z > /proc/sysrq-trigger !trace_empty(&iter) trace_iterator_reset(&iter) <- len = size = 0 cat /sys/kernel/tracing/trace_pipe trace_find_next_entry_inc(&iter) __find_next_entry ring_buffer_empty_cpu <- all empty return NULL trace_printk_seq(&iter.seq) WARN_ON_ONCE(s->seq.len >= s->seq.size) In the context between trace_empty() and trace_find_next_entry_inc() during ftrace_dump, the ring buffer data was consumed by other readers. This caused trace_find_next_entry_inc to return NULL, failing to populate `iter.seq`. At this point, due to the prior trace_iterator_reset, both `iter.seq.len` and `iter.seq.size` were set to 0. Since they are equal, the WARN_ON_ONCE condition is triggered. Move the trace_printk_seq() into the if block that checks to make sure the return value of trace_find_next_entry_inc() is non-NULL in ftrace_dump_one(), ensuring the 'iter.seq' is properly populated before subsequent operations. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Ingo Molnar Link: https://lore.kernel.org/20250822033343.3000289-1-wutengda@huaweicloud.com Fixes: d769041f8653 ("ring_buffer: implement new locking") Signed-off-by: Tengda Wu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d8935ed416d..1b7db732c0b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10638,10 +10638,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); + + trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); - - trace_printk_seq(&iter.seq); } if (!cnt) -- cgit v1.2.3 From bfb336cf97df7b37b2b2edec0f69773e06d11955 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 22 Aug 2025 18:36:06 -0400 Subject: ftrace: Also allocate and copy hash for reading of filter files Currently the reader of set_ftrace_filter and set_ftrace_notrace just adds the pointer to the global tracer hash to its iterator. Unlike the writer that allocates a copy of the hash, the reader keeps the pointer to the filter hashes. This is problematic because this pointer is static across function calls that release the locks that can update the global tracer hashes. This can cause UAF and similar bugs. Allocate and copy the hash for reading the filter files like it is done for the writers. This not only fixes UAF bugs, but also makes the code a bit simpler as it doesn't have to differentiate when to free the iterator's hash between writers and readers. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Linus Torvalds Link: https://lore.kernel.org/20250822183606.12962cc3@batman.local.home Fixes: c20489dad156 ("ftrace: Assign iter->hash to filter or notrace hashes on seq read") Closes: https://lore.kernel.org/all/20250813023044.2121943-1-wutengda@huaweicloud.com/ Closes: https://lore.kernel.org/all/20250822192437.GA458494@ax162/ Reported-by: Tengda Wu Tested-by: Tengda Wu Tested-by: Nathan Chancellor Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 00b76d450a89..a69067367c29 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4661,13 +4661,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); } + } else { + if (hash) + iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash); + else + iter->hash = EMPTY_HASH; + } - if (!iter->hash) { - trace_parser_put(&iter->parser); - goto out_unlock; - } - } else - iter->hash = hash; + if (!iter->hash) { + trace_parser_put(&iter->parser); + goto out_unlock; + } ret = 0; @@ -6543,9 +6547,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); - } else { - /* For read only, the hash is the ops hash */ - iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); -- cgit v1.2.3