From 1a251f52cfdc417c84411a056bc142cbd77baef4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 15:49:18 -0700 Subject: minmax: make generic MIN() and MAX() macros available everywhere This just standardizes the use of MIN() and MAX() macros, with the very traditional semantics. The goal is to use these for C constant expressions and for top-level / static initializers, and so be able to simplify the min()/max() macros. These macro names were used by various kernel code - they are very traditional, after all - and all such users have been fixed up, with a few different approaches: - trivial duplicated macro definitions have been removed Note that 'trivial' here means that it's obviously kernel code that already included all the major kernel headers, and thus gets the new generic MIN/MAX macros automatically. - non-trivial duplicated macro definitions are guarded with #ifndef This is the "yes, they define their own versions, but no, the include situation is not entirely obvious, and maybe they don't get the generic version automatically" case. - strange use case #1 A couple of drivers decided that the way they want to describe their versioning is with #define MAJ 1 #define MIN 2 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) which adds zero value and I just did my Alexander the Great impersonation, and rewrote that pointless Gordian knot as #define DRV_VERSION "1.2" instead. - strange use case #2 A couple of drivers thought that it's a good idea to have a random 'MIN' or 'MAX' define for a value or index into a table, rather than the traditional macro that takes arguments. These values were re-written as C enum's instead. The new function-line macros only expand when followed by an open parenthesis, and thus don't clash with enum use. Happily, there weren't really all that many of these cases, and a lot of users already had the pattern of using '#ifndef' guarding (or in one case just using '#undef MIN') before defining their own private version that does the same thing. I left such cases alone. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- kernel/trace/preemptirq_delay_test.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index cb0871fbdb07..314ffc143039 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); static struct completion done; -#define MIN(x, y) ((x) < (y) ? (x) : (y)) - static void busy_wait(ulong time) { u64 start, end; -- cgit v1.2.3 From 6623b0217d0c9bed80bfa43b778ce1c0eb03b497 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Jul 2024 18:45:41 +0200 Subject: locking/pvqspinlock: Correct the type of "old" variable in pv_kick_node() "enum vcpu_state" is not compatible with "u8" type for all targets, resulting in: error: initialization of 'u8 *' {aka 'unsigned char *'} from incompatible pointer type 'enum vcpu_state *' for LoongArch. Correct the type of "old" variable to "u8". Fixes: fea0e1820b51 ("locking/pvqspinlock: Use try_cmpxchg() in qspinlock_paravirt.h") Closes: https://lore.kernel.org/lkml/20240719024010.3296488-1-maobibo@loongson.cn/ Reported-by: Bibo Mao Signed-off-by: Uros Bizjak Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20240721164552.50175-1-ubizjak@gmail.com --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f5a36e67b593..ac2e22502741 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - enum vcpu_state old = vcpu_halted; + u8 old = vcpu_halted; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will -- cgit v1.2.3 From 77baa5bafcbe1b2a15ef9c37232c21279c95481c Mon Sep 17 00:00:00 2001 From: Zheng Zucheng Date: Fri, 26 Jul 2024 02:32:35 +0000 Subject: sched/cputime: Fix mul_u64_u64_div_u64() precision for cputime In extreme test scenarios: the 14th field utime in /proc/xx/stat is greater than sum_exec_runtime, utime = 18446744073709518790 ns, rtime = 135989749728000 ns In cputime_adjust() process, stime is greater than rtime due to mul_u64_u64_div_u64() precision problem. before call mul_u64_u64_div_u64(), stime = 175136586720000, rtime = 135989749728000, utime = 1416780000. after call mul_u64_u64_div_u64(), stime = 135989949653530 unsigned reversion occurs because rtime is less than stime. utime = rtime - stime = 135989749728000 - 135989949653530 = -199925530 = (u64)18446744073709518790 Trigger condition: 1). User task run in kernel mode most of time 2). ARM64 architecture 3). TICK_CPU_ACCOUNTING=y CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set Fix mul_u64_u64_div_u64() conversion precision by reset stime to rtime Fixes: 3dc167ba5729 ("sched/cputime: Improve cputime_adjust()") Signed-off-by: Zheng Zucheng Signed-off-by: Peter Zijlstra (Intel) Cc: Link: https://lkml.kernel.org/r/20240726023235.217771-1-zhengzucheng@huawei.com --- kernel/sched/cputime.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a5e00293ae43..0bed0fa1acd9 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, } stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); + /* + * Because mul_u64_u64_div_u64() can approximate on some + * achitectures; enforce the constraint that: a*b/(b+c) <= a. + */ + if (unlikely(stime > rtime)) + stime = rtime; update: /* -- cgit v1.2.3 From 31b164e2e4af84d08d2498083676e7eeaa102493 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 3 Jul 2024 11:16:07 +0800 Subject: sched/smt: Introduce sched_smt_present_inc/dec() helper Introduce sched_smt_present_inc/dec() helper, so it can be called in normal or error path simply. No functional changed. Cc: stable@kernel.org Signed-off-by: Yang Yingliang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240703031610.587047-2-yangyingliang@huaweicloud.com --- kernel/sched/core.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9f655025607..acc04ed9dbc2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7895,6 +7895,22 @@ static int cpuset_cpu_inactive(unsigned int cpu) return 0; } +static inline void sched_smt_present_inc(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); +#endif +} + +static inline void sched_smt_present_dec(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif +} + int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7906,13 +7922,10 @@ int sched_cpu_activate(unsigned int cpu) */ balance_push_set(cpu, false); -#ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_inc_cpuslocked(&sched_smt_present); -#endif + sched_smt_present_inc(cpu); set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -7981,13 +7994,12 @@ int sched_cpu_deactivate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); -#ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_dec_cpuslocked(&sched_smt_present); + sched_smt_present_dec(cpu); +#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); #endif -- cgit v1.2.3 From e22f910a26cc2a3ac9c66b8e935ef2a7dd881117 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 3 Jul 2024 11:16:08 +0800 Subject: sched/smt: Fix unbalance sched_smt_present dec/inc I got the following warn report while doing stress test: jump label: negative count! WARNING: CPU: 3 PID: 38 at kernel/jump_label.c:263 static_key_slow_try_dec+0x9d/0xb0 Call Trace: __static_key_slow_dec_cpuslocked+0x16/0x70 sched_cpu_deactivate+0x26e/0x2a0 cpuhp_invoke_callback+0x3ad/0x10d0 cpuhp_thread_fun+0x3f5/0x680 smpboot_thread_fn+0x56d/0x8d0 kthread+0x309/0x400 ret_from_fork+0x41/0x70 ret_from_fork_asm+0x1b/0x30 Because when cpuset_cpu_inactive() fails in sched_cpu_deactivate(), the cpu offline failed, but sched_smt_present is decremented before calling sched_cpu_deactivate(), it leads to unbalanced dec/inc, so fix it by incrementing sched_smt_present in the error path. Fixes: c5511d03ec09 ("sched/smt: Make sched_smt_present track topology") Cc: stable@kernel.org Signed-off-by: Yang Yingliang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Reviewed-by: Tim Chen Link: https://lore.kernel.org/r/20240703031610.587047-3-yangyingliang@huaweicloud.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index acc04ed9dbc2..949473e414f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8009,6 +8009,7 @@ int sched_cpu_deactivate(unsigned int cpu) sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { + sched_smt_present_inc(cpu); balance_push_set(cpu, false); set_cpu_active(cpu, true); sched_update_numa(cpu, true); -- cgit v1.2.3 From 2f027354122f58ee846468a6f6b48672fff92e9b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 3 Jul 2024 11:16:09 +0800 Subject: sched/core: Introduce sched_set_rq_on/offline() helper Introduce sched_set_rq_on/offline() helper, so it can be called in normal or error path simply. No functional changed. Cc: stable@kernel.org Signed-off-by: Yang Yingliang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240703031610.587047-4-yangyingliang@huaweicloud.com --- kernel/sched/core.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 949473e414f9..4d119e930beb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7845,6 +7845,30 @@ void set_rq_offline(struct rq *rq) } } +static inline void sched_set_rq_online(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + +static inline void sched_set_rq_offline(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + /* * used to mark begin/end of suspend/resume: */ @@ -7914,7 +7938,6 @@ static inline void sched_smt_present_dec(int cpu) int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; /* * Clear the balance_push callback and prepare to schedule @@ -7943,12 +7966,7 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_online(rq, cpu); return 0; } @@ -7956,7 +7974,6 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; int ret; /* @@ -7987,12 +8004,7 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_offline(rq, cpu); /* * When going down, decrement the number of cores with SMT present. -- cgit v1.2.3 From fe7a11c78d2a9bdb8b50afc278a31ac177000948 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 3 Jul 2024 11:16:10 +0800 Subject: sched/core: Fix unbalance set_rq_online/offline() in sched_cpu_deactivate() If cpuset_cpu_inactive() fails, set_rq_online() need be called to rollback. Fixes: 120455c514f7 ("sched: Fix hotplug vs CPU bandwidth control") Cc: stable@kernel.org Signed-off-by: Yang Yingliang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240703031610.587047-5-yangyingliang@huaweicloud.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d119e930beb..f3951e4a55e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8022,6 +8022,7 @@ int sched_cpu_deactivate(unsigned int cpu) ret = cpuset_cpu_inactive(cpu); if (ret) { sched_smt_present_inc(cpu); + sched_set_rq_online(rq, cpu); balance_push_set(cpu, false); set_cpu_active(cpu, true); sched_update_numa(cpu, true); -- cgit v1.2.3 From 7c51f7bbf057f82aeba3390c39ef61b244181c09 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Jul 2024 19:59:57 +0900 Subject: profiling: remove prof_cpu_mask syzbot is reporting uninit-value at profile_hits(), for there is a race window between if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; cpumask_copy(prof_cpu_mask, cpu_possible_mask); in profile_init() and cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) in profile_tick(); prof_cpu_mask remains uninitialzed until cpumask_copy() completes while cpumask_available(prof_cpu_mask) returns true as soon as alloc_cpumask_var(&prof_cpu_mask) completes. We could replace alloc_cpumask_var() with zalloc_cpumask_var() and call cpumask_copy() from create_proc_profile() on only UP kernels, for profile_online_cpu() calls cpumask_set_cpu() as needed via cpuhp_setup_state(CPUHP_AP_ONLINE_DYN) on SMP kernels. But this patch removes prof_cpu_mask because it seems unnecessary. The cpumask_test_cpu(smp_processor_id(), prof_cpu_mask) test in profile_tick() is likely always true due to a CPU cannot call profile_tick() if that CPU is offline and cpumask_set_cpu(cpu, prof_cpu_mask) is called when that CPU becomes online and cpumask_clear_cpu(cpu, prof_cpu_mask) is called when that CPU becomes offline . This test could be false during transition between online and offline. But according to include/linux/cpuhotplug.h , CPUHP_PROFILE_PREPARE belongs to PREPARE section, which means that the CPU subjected to profile_dead_cpu() cannot be inside profile_tick() (i.e. no risk of use-after-free bug) because interrupt for that CPU is disabled during PREPARE section. Therefore, this test is guaranteed to be true, and can be removed. (Since profile_hits() checks prof_buffer != NULL, we don't need to check prof_buffer != NULL here unless get_irq_regs() or user_mode() is such slow that we want to avoid when prof_buffer == NULL). do_profile_hits() is called from profile_tick() from timer interrupt only if cpumask_test_cpu(smp_processor_id(), prof_cpu_mask) is true and prof_buffer is not NULL. But syzbot is also reporting that sometimes do_profile_hits() is called while current thread is still doing vzalloc(), where prof_buffer must be NULL at this moment. This indicates that multiple threads concurrently tried to write to /sys/kernel/profiling interface, which caused that somebody else try to re-allocate prof_buffer despite somebody has already allocated prof_buffer. Fix this by using serialization. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=b1a83ab2a9eb9321fbdd Signed-off-by: Tetsuo Handa Tested-by: syzbot Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 7 +++++++ kernel/profile.c | 46 ++++++---------------------------------------- 2 files changed, 13 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42b..1bab21b4718f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj, const char *buf, size_t count) { int ret; + static DEFINE_MUTEX(lock); + /* + * We need serialization, for profile_setup() initializes prof_on + * value and profile_init() must not reallocate prof_buffer after + * once allocated. + */ + guard(mutex)(&lock); if (prof_on) return -EEXIST; /* diff --git a/kernel/profile.c b/kernel/profile.c index 2b775cc5c28f..4654c6cd984e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,7 +47,6 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_var_t prof_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -114,11 +113,6 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); - if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; @@ -132,7 +126,6 @@ int __ref profile_init(void) if (prof_buffer) return 0; - free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -267,9 +260,6 @@ static int profile_dead_cpu(unsigned int cpu) struct page *page; int i; - if (cpumask_available(prof_cpu_mask)) - cpumask_clear_cpu(cpu, prof_cpu_mask); - for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); @@ -302,14 +292,6 @@ static int profile_prepare_cpu(unsigned int cpu) return 0; } -static int profile_online_cpu(unsigned int cpu) -{ - if (cpumask_available(prof_cpu_mask)) - cpumask_set_cpu(cpu, prof_cpu_mask); - - return 0; -} - #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) @@ -334,8 +316,8 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && - cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) + /* This is the old kernel-only legacy profiling */ + if (!user_mode(regs)) profile_hit(type, (void *)profile_pc(regs)); } @@ -418,10 +400,6 @@ static const struct proc_ops profile_proc_ops = { int __ref create_proc_profile(void) { struct proc_dir_entry *entry; -#ifdef CONFIG_SMP - enum cpuhp_state online_state; -#endif - int err = 0; if (!prof_on) @@ -431,26 +409,14 @@ int __ref create_proc_profile(void) profile_prepare_cpu, profile_dead_cpu); if (err) return err; - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", - profile_online_cpu, NULL); - if (err < 0) - goto err_state_prep; - online_state = err; - err = 0; #endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); - if (!entry) - goto err_state_onl; - proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - - return err; -err_state_onl: + if (entry) + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); #ifdef CONFIG_SMP - cpuhp_remove_state(online_state); -err_state_prep: - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); + else + cpuhp_remove_state(CPUHP_PROFILE_PREPARE); #endif return err; } -- cgit v1.2.3 From 2accfdb7eff65f390c4308b0e9cb7c3fe48ad63c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 10:58:28 -0700 Subject: profiling: attempt to remove per-cpu profile flip buffer This is the really old legacy kernel profiling code, which has long since been obviated by "real profiling" (ie 'prof' and company), and mainly remains as a source of syzbot reports. There are anecdotal reports that people still use it for boot-time profiling, but it's unlikely that such use would care about the old NUMA optimizations in this code from 2004 (commit ad02973d42: "profile: 512x Altix timer interrupt livelock fix" in the BK import archive at [1]) So in order to head off future syzbot reports, let's try to simplify this code and get rid of the per-cpu profile buffers that are quite a large portion of the complexity footprint of this thing (including CPU hotplug callbacks etc). It's unlikely anybody will actually notice, or possibly, as Thomas put it: "Only people who indulge in nostalgia will notice :)". That said, if it turns out that this code is actually actively used by somebody, we can always revert this removal. Thus the "attempt" in the summary line. [ Note: in a small nod to "the profiling code can cause NUMA problems", this also removes the "increment the last entry in the profiling array on any unknown hits" logic. That would account any program counter in a module to that single counter location, and might exacerbate any NUMA cacheline bouncing issues ] Link: https://lore.kernel.org/all/CAHk-=wgs52BxT4Zjmjz8aNvHWKxf5_ThBY4bYL1Y6CTaNL2dTw@mail.gmail.com/ Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1] Cc: Thomas Gleixner Cc: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 1 - kernel/profile.c | 183 +-------------------------------------------- 2 files changed, 2 insertions(+), 182 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..affdd890899e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -100,7 +100,6 @@ enum cpuhp_state { CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, - CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, diff --git a/kernel/profile.c b/kernel/profile.c index 4654c6cd984e..2dfb0f4e755c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -129,180 +129,13 @@ int __ref profile_init(void) return -ENOMEM; } -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. - * - * -- nyc - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int profile_dead_cpu(unsigned int cpu) -{ - struct page *page; - int i; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); - per_cpu(cpu_profile_hits, cpu)[i] = NULL; - __free_page(page); - } - } - return 0; -} - -static int profile_prepare_cpu(unsigned int cpu) -{ - int i, node = cpu_to_mem(cpu); - struct page *page; - - per_cpu(cpu_profile_flip, cpu) = 0; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) - continue; - - page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { - profile_dead_cpu(cpu); - return -ENOMEM; - } - per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); - - } - return 0; -} - -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) - static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); + if (pc < prof_len) + atomic_add(nr_hits, &prof_buffer[pc]); } -#endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -340,7 +173,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) char *pnt; unsigned long sample_step = 1UL << prof_shift; - profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) @@ -386,7 +218,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return -EINVAL; } #endif - profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } @@ -404,20 +235,10 @@ int __ref create_proc_profile(void) if (!prof_on) return 0; -#ifdef CONFIG_SMP - err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", - profile_prepare_cpu, profile_dead_cpu); - if (err) - return err; -#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); if (entry) proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); -#ifdef CONFIG_SMP - else - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); -#endif return err; } subsys_initcall(create_proc_profile); -- cgit v1.2.3 From cec6937dd1aae1b38d147bd190cb895d06cf96d0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 12:05:06 -0700 Subject: task_work: make TWA_NMI_CURRENT handling conditional on IRQ_WORK The TWA_NMI_CURRENT handling very much depends on IRQ_WORK, but that isn't universally enabled everywhere. Maybe the IRQ_WORK infrastructure should just be unconditional - x86 ends up indirectly enabling it through unconditionally enabling PERF_EVENTS, for example. But it also gets enabled by having SMP support, or even if you just have PRINTK enabled. But in the meantime TWA_NMI_CURRENT causes tons of build failures on various odd minimal configs. Which did show up in linux-next, but despite that nobody bothered to fix it or even inform me until -rc1 was out. Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode") Reported-by: Naresh Kamboju Reported-by: kernelci.org bot Reported-by: Guenter Roeck Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Signed-off-by: Linus Torvalds --- kernel/task_work.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c2daa7ad3f9..5d14d639ac71 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -6,12 +6,14 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif /** * task_work_add - ask the @task to execute @work->func() @@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work, if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; + if (!IS_ENABLED(CONFIG_IRQ_WORK)) + return -EINVAL; } else { /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); @@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; +#ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; +#endif default: WARN_ON_ONCE(1); break; -- cgit v1.2.3 From 94ede2a3e9135764736221c080ac7c0ad993dc2d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 16:34:17 -0700 Subject: profiling: remove stale percpu flip buffer variables For some reason I didn't see this issue on my arm64 or x86-64 builds, but Stephen Rothwell reports that commit 2accfdb7eff6 ("profiling: attempt to remove per-cpu profile flip buffer") left these static variables around, and the powerpc build is unhappy about them: kernel/profile.c:52:28: warning: 'cpu_profile_flip' defined but not used [-Wunused-variable] 52 | static DEFINE_PER_CPU(int, cpu_profile_flip); | ^~~~~~~~~~~~~~~~ .. So remove these stale left-over remnants too. Fixes: 2accfdb7eff6 ("profiling: attempt to remove per-cpu profile flip buffer") Reported-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 2dfb0f4e755c..ff68d3816182 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,12 +47,6 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ - int profile_setup(char *str) { static const char schedstr[] = "schedule"; -- cgit v1.2.3 From 6881e75237a84093d0986f56223db3724619f26e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2024 12:23:51 +0200 Subject: tick/broadcast: Move per CPU pointer access into the atomic section The recent fix for making the take over of the broadcast timer more reliable retrieves a per CPU pointer in preemptible context. This went unnoticed as compilers hoist the access into the non-preemptible region where the pointer is actually used. But of course it's valid that the compiler keeps it at the place where the code puts it which rightfully triggers: BUG: using smp_processor_id() in preemptible [00000000] code: caller is hotplug_cpu__broadcast_tick_pull+0x1c/0xc0 Move it to the actual usage site which is in a non-preemptible region. Fixes: f7d43dd206e7 ("tick/broadcast: Make takeover of broadcast hrtimer reliable") Reported-by: David Wang <00107082@163.com> Signed-off-by: Thomas Gleixner Tested-by: Yu Liao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ttg56ers.ffs@tglx --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b4843099a8da..ed58eebb4e8f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,7 +1141,6 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { - struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1167,6 +1166,8 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) * device to avoid the starvation. */ if (tick_check_broadcast_expired()) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } -- cgit v1.2.3 From 224fa3552029a3d14bec7acf72ded8171d551b88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 Jul 2024 12:43:21 +0200 Subject: jump_label: Fix the fix, brown paper bags galore Per the example of: !atomic_cmpxchg(&key->enabled, 0, 1) the inverse was written as: atomic_cmpxchg(&key->enabled, 1, 0) except of course, that while !old is only true for old == 0, old is true for everything except old == 0. Fix it to read: atomic_cmpxchg(&key->enabled, 1, 0) == 1 such that only the 1->0 transition returns true and goes on to disable the keys. Fixes: 83ab38ef0a0b ("jump_label: Fix concurrency issues in static_key_slow_dec()") Reported-by: Darrick J. Wong Signed-off-by: Peter Zijlstra (Intel) Tested-by: Darrick J. Wong Link: https://lkml.kernel.org/r/20240731105557.GY33588@noisy.programming.kicks-ass.net --- kernel/jump_label.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4ad5ed8adf96..6dc76b590703 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -236,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key) } jump_label_lock(); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } @@ -289,7 +289,7 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); else WARN_ON_ONCE(!static_key_slow_try_dec(key)); -- cgit v1.2.3 From f2655ac2c06a15558e51ed6529de280e1553c86e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Aug 2024 08:46:15 -0700 Subject: clocksource: Fix brown-bag boolean thinko in cs_watchdog_read() The current "nretries > 1 || nretries >= max_retries" check in cs_watchdog_read() will always evaluate to true, and thus pr_warn(), if nretries is greater than 1. The intent is instead to never warn on the first try, but otherwise warn if the successful retry was the last retry. Therefore, change that "||" to "&&". Fixes: db3a34e17433 ("clocksource: Retry clock read if long delays detected") Reported-by: Borislav Petkov Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240802154618.4149953-2-paulmck@kernel.org --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d25ba49e313c..d0538a75f4c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -246,7 +246,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); if (wd_delay <= WATCHDOG_MAX_SKEW) { - if (nretries > 1 || nretries >= max_retries) { + if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } -- cgit v1.2.3 From b88f55389ad27f05ed84af9e1026aa64dbfabc9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 4 Aug 2024 18:48:10 +0900 Subject: profiling: remove profile=sleep support The kernel sleep profile is no longer working due to a recursive locking bug introduced by commit 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") Booting with the 'profile=sleep' kernel command line option added or executing # echo -n sleep > /sys/kernel/profiling after boot causes the system to lock up. Lockdep reports kthreadd/3 is trying to acquire lock: ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: get_wchan+0x32/0x70 but task is already holding lock: ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: try_to_wake_up+0x53/0x370 with the call trace being lock_acquire+0xc8/0x2f0 get_wchan+0x32/0x70 __update_stats_enqueue_sleeper+0x151/0x430 enqueue_entity+0x4b0/0x520 enqueue_task_fair+0x92/0x6b0 ttwu_do_activate+0x73/0x140 try_to_wake_up+0x213/0x370 swake_up_locked+0x20/0x50 complete+0x2f/0x40 kthread+0xfb/0x180 However, since nobody noticed this regression for more than two years, let's remove 'profile=sleep' support based on the assumption that nobody needs this functionality. Fixes: 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 4 +--- include/linux/profile.h | 1 - kernel/profile.c | 11 +---------- kernel/sched/stats.c | 10 ---------- 4 files changed, 2 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..09126bb8cc9f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4798,11 +4798,9 @@ profile= [KNL] Enable kernel profiling via /proc/profile Format: [,] - Param: : "schedule", "sleep", or "kvm" + Param: : "schedule" or "kvm" [defaults to kernel profiling] Param: "schedule" - profile schedule points. - Param: "sleep" - profile D-state sleeping (millisecs). - Requires CONFIG_SCHEDSTATS Param: "kvm" - profile VM exits. Param: - step/bucket size as a power of 2 for statistical time based profiling. diff --git a/include/linux/profile.h b/include/linux/profile.h index 2fb487f61d12..3f53cdb0c27c 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -10,7 +10,6 @@ #define CPU_PROFILING 1 #define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 #define KVM_PROFILING 4 struct proc_dir_entry; diff --git a/kernel/profile.c b/kernel/profile.c index ff68d3816182..1fcf1adcf4eb 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -50,20 +50,11 @@ EXPORT_SYMBOL_GPL(prof_on); int profile_setup(char *str) { static const char schedstr[] = "schedule"; - static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; - if (!strncmp(str, sleepstr, strlen(sleepstr))) { -#ifdef CONFIG_SCHEDSTATS - force_schedstat_enabled(); - prof_on = SLEEP_PROFILING; - select = sleepstr; -#else - pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); -#endif /* CONFIG_SCHEDSTATS */ - } else if (!strncmp(str, schedstr, strlen(schedstr))) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 78e48f5426ee..eb0cdcd4d921 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, trace_sched_stat_blocked(p, delta); - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(p), - delta >> 20); - } account_scheduler_latency(p, delta >> 10, 0); } } -- cgit v1.2.3 From 8c8acb8f26cbde665b233dd1b9bbcbb9b86822dc Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 2 Aug 2024 22:53:15 +0900 Subject: kprobes: Fix to check symbol prefixes correctly Since str_has_prefix() takes the prefix as the 2nd argument and the string as the first, is_cfi_preamble_symbol() always fails to check the prefix. Fix the function parameter order so that it correctly check the prefix. Link: https://lore.kernel.org/all/172260679559.362040.7360872132937227206.stgit@devnote2/ Fixes: de02f2ac5d8c ("kprobes: Prohibit probing on CFI preamble symbol") Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e85de37d9e1e..da59c68df841 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1557,8 +1557,8 @@ static bool is_cfi_preamble_symbol(unsigned long addr) if (lookup_symbol_name(addr, symbuf)) return false; - return str_has_prefix("__cfi_", symbuf) || - str_has_prefix("__pfx_", symbuf); + return str_has_prefix(symbuf, "__cfi_") || + str_has_prefix(symbuf, "__pfx_"); } static int check_kprobe_address_safe(struct kprobe *p, -- cgit v1.2.3 From 87d571d6fb77ec342a985afa8744bb9bb75b3622 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 17 May 2024 20:22:44 +0000 Subject: ntp: Clamp maxerror and esterror to operating range Using syzkaller alongside the newly reintroduced signed integer overflow sanitizer spits out this report: UBSAN: signed-integer-overflow in ../kernel/time/ntp.c:461:16 9223372036854775807 + 500 cannot be represented in type 'long' Call Trace: handle_overflow+0x171/0x1b0 second_overflow+0x2d6/0x500 accumulate_nsecs_to_secs+0x60/0x160 timekeeping_advance+0x1fe/0x890 update_wall_time+0x10/0x30 time_maxerror is unconditionally incremented and the result is checked against NTP_PHASE_LIMIT, but the increment itself can overflow, resulting in wrap-around to negative space. Before commit eea83d896e31 ("ntp: NTP4 user space bits update") the user supplied value was sanity checked to be in the operating range. That change removed the sanity check and relied on clamping in handle_overflow() which does not work correctly when the user supplied value is in the overflow zone of the '+ 500' operation. The operation requires CAP_SYS_TIME and the side effect of the overflow is NTP getting out of sync. Miroslav confirmed that the input value should be clamped to the operating range and the same applies to time_esterror. The latter is not used by the kernel, but the value still should be in the operating range as it was before the sanity check got removed. Clamp them to the operating range. [ tglx: Changed it to clamping and included time_esterror ] Fixes: eea83d896e31 ("ntp: NTP4 user space bits update") Signed-off-by: Justin Stitt Signed-off-by: Thomas Gleixner Cc: Miroslav Lichvar Link: https://lore.kernel.org/all/20240517-b4-sio-ntp-usec-v2-1-d539180f2b79@google.com Closes: https://github.com/KSPP/linux/issues/354 --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 406dccb79c2b..502e1e5b7f7f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -727,10 +727,10 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, } if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; + time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; + time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { time_constant = txc->constant; -- cgit v1.2.3 From 06c03c8edce333b9ad9c6b207d93d3a5ae7c10c0 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 17 May 2024 00:47:10 +0000 Subject: ntp: Safeguard against time_constant overflow Using syzkaller with the recently reintroduced signed integer overflow sanitizer produces this UBSAN report: UBSAN: signed-integer-overflow in ../kernel/time/ntp.c:738:18 9223372036854775806 + 4 cannot be represented in type 'long' Call Trace: handle_overflow+0x171/0x1b0 __do_adjtimex+0x1236/0x1440 do_adjtimex+0x2be/0x740 The user supplied time_constant value is incremented by four and then clamped to the operating range. Before commit eea83d896e31 ("ntp: NTP4 user space bits update") the user supplied value was sanity checked to be in the operating range. That change removed the sanity check and relied on clamping after incrementing which does not work correctly when the user supplied value is in the overflow zone of the '+ 4' operation. The operation requires CAP_SYS_TIME and the side effect of the overflow is NTP getting out of sync. Similar to the fixups for time_maxerror and time_esterror, clamp the user space supplied value to the operating range. [ tglx: Switch to clamping ] Fixes: eea83d896e31 ("ntp: NTP4 user space bits update") Signed-off-by: Justin Stitt Signed-off-by: Thomas Gleixner Cc: Miroslav Lichvar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240517-b4-sio-ntp-c-v2-1-f3a80096f36f@google.com Closes: https://github.com/KSPP/linux/issues/352 --- kernel/time/ntp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 502e1e5b7f7f..8d2dd214ec68 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -733,11 +733,10 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; + time_constant = clamp(txc->constant, 0, MAXTC); if (!(time_status & STA_NANO)) time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); + time_constant = clamp(time_constant, 0, MAXTC); } if (txc->modes & ADJ_TAI && -- cgit v1.2.3 From 5916be8a53de6401871bdd953f6c60237b47d6d3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 3 Aug 2024 17:07:51 +0200 Subject: timekeeping: Fix bogus clock_was_set() invocation in do_adjtimex() The addition of the bases argument to clock_was_set() fixed up all call sites correctly except for do_adjtimex(). This uses CLOCK_REALTIME instead of CLOCK_SET_WALL as argument. CLOCK_REALTIME is 0. As a result the effect of that clock_was_set() notification is incomplete and might result in timers expiring late because the hrtimer code does not re-evaluate the affected clock bases. Use CLOCK_SET_WALL instead of CLOCK_REALTIME to tell the hrtimers code which clock bases need to be re-evaluated. Fixes: 17a1b8826b45 ("hrtimer: Add bases argument to clock_was_set()") Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/877ccx7igo.ffs@tglx --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2fa87dcfeda9..5391e4167d60 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2606,7 +2606,7 @@ int do_adjtimex(struct __kernel_timex *txc) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) - clock_was_set(CLOCK_REALTIME); + clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(); -- cgit v1.2.3 From bd44ca3de49cc1badcff7a96010fa2c64f04868c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 6 Aug 2024 11:56:45 -0400 Subject: dma-debug: avoid deadlock between dma debug vs printk and netconsole Currently the dma debugging code can end up indirectly calling printk under the radix_lock. This happens when a radix tree node allocation fails. This is a problem because the printk code, when used together with netconsole, can end up inside the dma debugging code while trying to transmit a message over netcons. This creates the possibility of either a circular deadlock on the same CPU, with that CPU trying to grab the radix_lock twice, or an ABBA deadlock between different CPUs, where one CPU grabs the console lock first and then waits for the radix_lock, while the other CPU is holding the radix_lock and is waiting for the console lock. The trace captured by lockdep is of the ABBA variant. -> #2 (&dma_entry_hash[i].lock){-.-.}-{2:2}: _raw_spin_lock_irqsave+0x5a/0x90 debug_dma_map_page+0x79/0x180 dma_map_page_attrs+0x1d2/0x2f0 bnxt_start_xmit+0x8c6/0x1540 netpoll_start_xmit+0x13f/0x180 netpoll_send_skb+0x20d/0x320 netpoll_send_udp+0x453/0x4a0 write_ext_msg+0x1b9/0x460 console_flush_all+0x2ff/0x5a0 console_unlock+0x55/0x180 vprintk_emit+0x2e3/0x3c0 devkmsg_emit+0x5a/0x80 devkmsg_write+0xfd/0x180 do_iter_readv_writev+0x164/0x1b0 vfs_writev+0xf9/0x2b0 do_writev+0x6d/0x110 do_syscall_64+0x80/0x150 entry_SYSCALL_64_after_hwframe+0x4b/0x53 -> #0 (console_owner){-.-.}-{0:0}: __lock_acquire+0x15d1/0x31a0 lock_acquire+0xe8/0x290 console_flush_all+0x2ea/0x5a0 console_unlock+0x55/0x180 vprintk_emit+0x2e3/0x3c0 _printk+0x59/0x80 warn_alloc+0x122/0x1b0 __alloc_pages_slowpath+0x1101/0x1120 __alloc_pages+0x1eb/0x2c0 alloc_slab_page+0x5f/0x150 new_slab+0x2dc/0x4e0 ___slab_alloc+0xdcb/0x1390 kmem_cache_alloc+0x23d/0x360 radix_tree_node_alloc+0x3c/0xf0 radix_tree_insert+0xf5/0x230 add_dma_entry+0xe9/0x360 dma_map_page_attrs+0x1d2/0x2f0 __bnxt_alloc_rx_frag+0x147/0x180 bnxt_alloc_rx_data+0x79/0x160 bnxt_rx_skb+0x29/0xc0 bnxt_rx_pkt+0xe22/0x1570 __bnxt_poll_work+0x101/0x390 bnxt_poll+0x7e/0x320 __napi_poll+0x29/0x160 net_rx_action+0x1e0/0x3e0 handle_softirqs+0x190/0x510 run_ksoftirqd+0x4e/0x90 smpboot_thread_fn+0x1a8/0x270 kthread+0x102/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x11/0x20 This bug is more likely than it seems, because when one CPU has run out of memory, chances are the other has too. The good news is, this bug is hidden behind the CONFIG_DMA_API_DEBUG, so not many users are likely to trigger it. Signed-off-by: Rik van Riel Reported-by: Konstantin Ovsepian Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a6e3792b15f8..d570535342cb 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -416,8 +416,11 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. + * + * Use __GFP_NOWARN because the printk from an OOM, to netconsole, could end + * up right back in the DMA debugging code, leading to a deadlock. */ -static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) -- cgit v1.2.3 From ff9bf4b34104955017822e9bc42aeeb526ee2a80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 21:14:08 -0400 Subject: lockdep: Fix lockdep_set_notrack_class() for CONFIG_LOCK_STAT We won't find a contended lock if it's not being tracked. Signed-off-by: Kent Overstreet --- kernel/locking/lockdep.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 58c88220a478..0349f957e672 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5936,6 +5936,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, ip); @@ -5978,6 +5981,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, _RET_IP_); -- cgit v1.2.3 From edbbaae42a56f9a2b39c52ef2504dfb3fb0a7858 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Aug 2024 10:20:44 +0300 Subject: genirq/irqdesc: Honor caller provided affinity in alloc_desc() Currently, whenever a caller is providing an affinity hint for an interrupt, the allocation code uses it to calculate the node and copies the cpumask into irq_desc::affinity. If the affinity for the interrupt is not marked 'managed' then the startup of the interrupt ignores irq_desc::affinity and uses the system default affinity mask. Prevent this by setting the IRQD_AFFINITY_SET flag for the interrupt in the allocator, which causes irq_setup_affinity() to use irq_desc::affinity on interrupt startup if the mask contains an online CPU. [ tglx: Massaged changelog ] Fixes: 45ddcecbfa94 ("genirq: Use affinity hint in irqdesc allocation") Signed-off-by: Shay Drory Signed-off-by: Thomas Gleixner Cc: Link: https://lore.kernel.org/all/20240806072044.837827-1-shayd@nvidia.com --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 07e99c936ba5..1dee88ba0ae4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -530,6 +530,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } + flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; -- cgit v1.2.3 From b1560408692cd0ab0370cfbe9deb03ce97ab3f6d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 30 Jul 2024 11:06:57 -0400 Subject: tracing: Have format file honor EVENT_FILE_FL_FREED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When eventfs was introduced, special care had to be done to coordinate the freeing of the file meta data with the files that are exposed to user space. The file meta data would have a ref count that is set when the file is created and would be decremented and freed after the last user that opened the file closed it. When the file meta data was to be freed, it would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed, and any new references made (like new opens or reads) would fail as it is marked freed. This allowed other meta data to be freed after this flag was set (under the event_mutex). All the files that were dynamically created in the events directory had a pointer to the file meta data and would call event_release() when the last reference to the user space file was closed. This would be the time that it is safe to free the file meta data. A shortcut was made for the "format" file. It's i_private would point to the "call" entry directly and not point to the file's meta data. This is because all format files are the same for the same "call", so it was thought there was no reason to differentiate them. The other files maintain state (like the "enable", "trigger", etc). But this meant if the file were to disappear, the "format" file would be unaware of it. This caused a race that could be trigger via the user_events test (that would create dynamic events and free them), and running a loop that would read the user_events format files: In one console run: # cd tools/testing/selftests/user_events # while true; do ./ftrace_test; done And in another console run: # cd /sys/kernel/tracing/ # while true; do cat events/user_events/__test_event/format; done 2>/dev/null With KASAN memory checking, it would trigger a use-after-free bug report (which was a real bug). This was because the format file was not checking the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the event that the file meta data pointed to after the event was freed. After inspection, there are other locations that were found to not check the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a new helper function: event_file_file() that will make sure that the event_mutex is held, and will return NULL if the trace_event_file has the EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file pointer use event_file_file() and check for NULL. Later uses can still use the event_file_data() helper function if the event_mutex is still held and was not released since the event_file_file() call. Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Ajay Kaher Cc: Ilkka Naulapää Cc: Linus Torvalds Cc: Al Viro Cc: Dan Carpenter Cc: Beau Belgrave Cc: Florian Fainelli Cc: Alexey Makhalov Cc: Vasavi Sirnapalli Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode") Reported-by: Mathias Krause Tested-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 23 +++++++++++++++++++++++ kernel/trace/trace_events.c | 33 ++++++++++++++++++++------------- kernel/trace/trace_events_hist.c | 4 ++-- kernel/trace/trace_events_inject.c | 2 +- kernel/trace/trace_events_trigger.c | 6 +++--- 5 files changed, 49 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8783bebd0562..bd3e3069300e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1634,6 +1634,29 @@ static inline void *event_file_data(struct file *filp) extern struct mutex event_mutex; extern struct list_head ftrace_events; +/* + * When the trace_event_file is the filp->i_private pointer, + * it must be taken under the event_mutex lock, and then checked + * if the EVENT_FILE_FL_FREED flag is set. If it is, then the + * data pointed to by the trace_event_file can not be trusted. + * + * Use the event_file_file() to access the trace_event_file from + * the filp the first time under the event_mutex and check for + * NULL. If it is needed to be retrieved again and the event_mutex + * is still held, then the event_file_data() can be used and it + * is guaranteed to be valid. + */ +static inline struct trace_event_file *event_file_file(struct file *filp) +{ + struct trace_event_file *file; + + lockdep_assert_held(&event_mutex); + file = READ_ONCE(file_inode(filp)->i_private); + if (!file || file->flags & EVENT_FILE_FL_FREED) + return NULL; + return file; +} + extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6ef29eba90ce..f08fbaf8cad6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[4] = "0"; mutex_lock(&event_mutex); - file = event_file_data(filp); + file = event_file_file(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); - if (!file || flags & EVENT_FILE_FL_FREED) + if (!file) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 1: ret = -ENODEV; mutex_lock(&event_mutex); - file = event_file_data(filp); - if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { + file = event_file_file(filp); + if (likely(file)) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); @@ -1540,7 +1540,8 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_call *call = event_file_data(m->private); + struct trace_event_file *file = event_file_data(m->private); + struct trace_event_call *call = file->event_call; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; @@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) static int f_show(struct seq_file *m, void *v) { - struct trace_event_call *call = event_file_data(m->private); + struct trace_event_file *file = event_file_data(m->private); + struct trace_event_call *call = file->event_call; struct ftrace_event_field *field; const char *array_descriptor; @@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v) static void *f_start(struct seq_file *m, loff_t *pos) { + struct trace_event_file *file; void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); - if (!event_file_data(m->private)) + file = event_file_file(m->private); + if (!file) return ERR_PTR(-ENODEV); while (l < *pos && p) @@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file && !(file->flags & EVENT_FILE_FL_FREED)) + file = event_file_file(filp); + if (file) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return PTR_ERR(buf); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file) - err = apply_event_filter(file, buf); + file = event_file_file(filp); + if (file) { + if (file->flags & EVENT_FILE_FL_FREED) + err = -ENODEV; + else + err = apply_event_filter(file, buf); + } mutex_unlock(&event_mutex); kfree(buf); @@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data, if (strcmp(name, "format") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_event_format_fops; - *data = call; return 1; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6ece1308d36a..5f9119eb7c67 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v) mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) { ret = -ENODEV; goto out_unlock; @@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v) mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) { ret = -ENODEV; goto out_unlock; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index 8650562bdaa9..a8f076809db4 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, strim(buf); mutex_lock(&event_mutex); - file = event_file_data(filp); + file = event_file_file(filp); if (file) { call = file->event_call; size = parse_entry(buf, call, &entry); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4bec043c8690..a5e3d6acf1e1 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) return ERR_PTR(-ENODEV); @@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) mutex_lock(&event_mutex); - if (unlikely(!event_file_data(file))) { + if (unlikely(!event_file_file(file))) { mutex_unlock(&event_mutex); return -ENODEV; } @@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file, strim(buf); mutex_lock(&event_mutex); - event_file = event_file_data(file); + event_file = event_file_file(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); kfree(buf); -- cgit v1.2.3 From 6e2fdceffdc6bd7b8ba314a1d1b976721533c8f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 26 Jul 2024 14:42:08 -0400 Subject: tracing: Use refcount for trace_event_file reference counter Instead of using an atomic counter for the trace_event_file reference counter, use the refcount interface. It has various checks to make sure the reference counting is correct, and will warn if it detects an error (like refcount_inc() on '0'). Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240726144208.687cce24@rorschach.local.home Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- kernel/trace/trace_events.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..fed58e54f15e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -680,7 +680,7 @@ struct trace_event_file { * caching and such. Which is mostly OK ;-) */ unsigned long flags; - atomic_t ref; /* ref count for opened files */ + refcount_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f08fbaf8cad6..7266ec2a4eea 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -992,18 +992,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) void event_file_get(struct trace_event_file *file) { - atomic_inc(&file->ref); + refcount_inc(&file->ref); } void event_file_put(struct trace_event_file *file) { - if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (WARN_ON_ONCE(!refcount_read(&file->ref))) { if (file->flags & EVENT_FILE_FL_FREED) kmem_cache_free(file_cachep, file); return; } - if (atomic_dec_and_test(&file->ref)) { + if (refcount_dec_and_test(&file->ref)) { /* Count should only go to zero when it is freed */ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) return; @@ -3003,7 +3003,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); - event_file_get(file); + refcount_set(&file->ref, 1); return file; } -- cgit v1.2.3 From 604b72b32522d548f855ed82842d2e49bf384edb Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Sat, 3 Aug 2024 15:09:26 +0200 Subject: function_graph: Fix the ret_stack used by ftrace_graph_ret_addr() When ftrace_graph_ret_addr() is invoked to convert a found stack return address to its original value, the function can end up producing the following crash: [ 95.442712] BUG: kernel NULL pointer dereference, address: 0000000000000028 [ 95.442720] #PF: supervisor read access in kernel mode [ 95.442724] #PF: error_code(0x0000) - not-present page [ 95.442727] PGD 0 P4D 0- [ 95.442731] Oops: Oops: 0000 [#1] PREEMPT SMP PTI [ 95.442736] CPU: 1 UID: 0 PID: 2214 Comm: insmod Kdump: loaded Tainted: G OE K 6.11.0-rc1-default #1 67c62a3b3720562f7e7db5f11c1fdb40b7a2857c [ 95.442747] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE, [K]=LIVEPATCH [ 95.442750] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [ 95.442754] RIP: 0010:ftrace_graph_ret_addr+0x42/0xc0 [ 95.442766] Code: [...] [ 95.442773] RSP: 0018:ffff979b80ff7718 EFLAGS: 00010006 [ 95.442776] RAX: ffffffff8ca99b10 RBX: ffff979b80ff7760 RCX: ffff979b80167dc0 [ 95.442780] RDX: ffffffff8ca99b10 RSI: ffff979b80ff7790 RDI: 0000000000000005 [ 95.442783] RBP: 0000000000000001 R08: 0000000000000005 R09: 0000000000000000 [ 95.442786] R10: 0000000000000005 R11: 0000000000000000 R12: ffffffff8e9491e0 [ 95.442790] R13: ffffffff8d6f70f0 R14: ffff979b80167da8 R15: ffff979b80167dc8 [ 95.442793] FS: 00007fbf83895740(0000) GS:ffff8a0afdd00000(0000) knlGS:0000000000000000 [ 95.442797] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 95.442800] CR2: 0000000000000028 CR3: 0000000005070002 CR4: 0000000000370ef0 [ 95.442806] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 95.442809] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 95.442816] Call Trace: [ 95.442823] [ 95.442896] unwind_next_frame+0x20d/0x830 [ 95.442905] arch_stack_walk_reliable+0x94/0xe0 [ 95.442917] stack_trace_save_tsk_reliable+0x7d/0xe0 [ 95.442922] klp_check_and_switch_task+0x55/0x1a0 [ 95.442931] task_call_func+0xd3/0xe0 [ 95.442938] klp_try_switch_task.part.5+0x37/0x150 [ 95.442942] klp_try_complete_transition+0x79/0x2d0 [ 95.442947] klp_enable_patch+0x4db/0x890 [ 95.442960] do_one_initcall+0x41/0x2e0 [ 95.442968] do_init_module+0x60/0x220 [ 95.442975] load_module+0x1ebf/0x1fb0 [ 95.443004] init_module_from_file+0x88/0xc0 [ 95.443010] idempotent_init_module+0x190/0x240 [ 95.443015] __x64_sys_finit_module+0x5b/0xc0 [ 95.443019] do_syscall_64+0x74/0x160 [ 95.443232] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 95.443236] RIP: 0033:0x7fbf82f2c709 [ 95.443241] Code: [...] [ 95.443247] RSP: 002b:00007fffd5ea3b88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 95.443253] RAX: ffffffffffffffda RBX: 000056359c48e750 RCX: 00007fbf82f2c709 [ 95.443257] RDX: 0000000000000000 RSI: 000056356ed4efc5 RDI: 0000000000000003 [ 95.443260] RBP: 000056356ed4efc5 R08: 0000000000000000 R09: 00007fffd5ea3c10 [ 95.443263] R10: 0000000000000003 R11: 0000000000000246 R12: 0000000000000000 [ 95.443267] R13: 000056359c48e6f0 R14: 0000000000000000 R15: 0000000000000000 [ 95.443272] [ 95.443274] Modules linked in: [...] [ 95.443385] Unloaded tainted modules: intel_uncore_frequency(E):1 isst_if_common(E):1 skx_edac(E):1 [ 95.443414] CR2: 0000000000000028 The bug can be reproduced with kselftests: cd linux/tools/testing/selftests make TARGETS='ftrace livepatch' (cd ftrace; ./ftracetest test.d/ftrace/fgraph-filter.tc) (cd livepatch; ./test-livepatch.sh) The problem is that ftrace_graph_ret_addr() is supposed to operate on the ret_stack of a selected task but wrongly accesses the ret_stack of the current task. Specifically, the above NULL dereference occurs when task->curr_ret_stack is non-zero, but current->ret_stack is NULL. Correct ftrace_graph_ret_addr() to work with the right ret_stack. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Reported-by: Miroslav Benes Link: https://lore.kernel.org/20240803131211.17255-1-petr.pavlu@suse.com Fixes: 7aa1eaef9f42 ("function_graph: Allow multiple users to attach to function graph") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fc205ad167a9..d1d5ea2d0a1b 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -902,7 +902,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, i = *idx ? : task->curr_ret_stack; while (i > 0) { - ret_stack = get_ret_stack(current, i, &i); + ret_stack = get_ret_stack(task, i, &i); if (!ret_stack) break; /* -- cgit v1.2.3 From bcf86c01ca4676316557dd482c8416ece8c2e143 Mon Sep 17 00:00:00 2001 From: Tze-nan Wu Date: Mon, 5 Aug 2024 13:59:22 +0800 Subject: tracing: Fix overflow in get_free_elt() "tracing_map->next_elt" in get_free_elt() is at risk of overflowing. Once it overflows, new elements can still be inserted into the tracing_map even though the maximum number of elements (`max_elts`) has been reached. Continuing to insert elements after the overflow could result in the tracing_map containing "tracing_map->max_size" elements, leaving no empty entries. If any attempt is made to insert an element into a full tracing_map using `__tracing_map_insert()`, it will cause an infinite loop with preemption disabled, leading to a CPU hang problem. Fix this by preventing any further increments to "tracing_map->next_elt" once it reaches "tracing_map->max_elt". Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Fixes: 08d43a5fa063e ("tracing: Add lock-free tracing_map") Co-developed-by: Cheng-Jui Wang Link: https://lore.kernel.org/20240805055922.6277-1-Tze-nan.Wu@mediatek.com Signed-off-by: Cheng-Jui Wang Signed-off-by: Tze-nan Wu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index a4dcf0f24352..3a56e7c8aa4f 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map) struct tracing_map_elt *elt = NULL; int idx; - idx = atomic_inc_return(&map->next_elt); + idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts); if (idx < map->max_elts) { elt = *(TRACING_MAP_ELT(map->elts, idx)); if (map->ops && map->ops->elt_init) @@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map) { unsigned int i; - atomic_set(&map->next_elt, -1); + atomic_set(&map->next_elt, 0); atomic64_set(&map->hits, 0); atomic64_set(&map->drops, 0); @@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits, map->map_bits = map_bits; map->max_elts = (1 << map_bits); - atomic_set(&map->next_elt, -1); + atomic_set(&map->next_elt, 0); map->map_size = (1 << (map_bits + 1)); map->ops = ops; -- cgit v1.2.3 From 58f7e4d7ba32758b861807e77535853cacc1f426 Mon Sep 17 00:00:00 2001 From: Jianhui Zhou <912460177@qq.com> Date: Mon, 5 Aug 2024 19:36:31 +0800 Subject: ring-buffer: Remove unused function ring_buffer_nr_pages() Because ring_buffer_nr_pages() is not an inline function and user accesses buffer->buffers[cpu]->nr_pages directly, the function ring_buffer_nr_pages is removed. Signed-off-by: Jianhui Zhou <912460177@qq.com> Link: https://lore.kernel.org/tencent_F4A7E9AB337F44E0F4B858D07D19EF460708@qq.com Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 - kernel/trace/ring_buffer.c | 12 ------------ 2 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 96d2140b471e..fd35d4ec12e1 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -193,7 +193,6 @@ void ring_buffer_set_clock(struct trace_buffer *buffer, void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs); bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); -size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); struct buffer_data_read_page; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 28853966aa9a..cebd879a30cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -692,18 +692,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, return ts; } -/** - * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer - * @buffer: The ring_buffer to get the number of pages from - * @cpu: The cpu of the ring_buffer to get the number of pages from - * - * Returns the number of pages used by a per_cpu buffer of the ring buffer. - */ -size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) -{ - return buffer->buffers[cpu]->nr_pages; -} - /** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from -- cgit v1.2.3 From 7d4df2dad312f270d62fecb0e5c8b086c6d7dcfc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 29 Jul 2024 04:21:58 +0200 Subject: kcov: properly check for softirq context When collecting coverage from softirqs, KCOV uses in_serving_softirq() to check whether the code is running in the softirq context. Unfortunately, in_serving_softirq() is > 0 even when the code is running in the hardirq or NMI context for hardirqs and NMIs that happened during a softirq. As a result, if a softirq handler contains a remote coverage collection section and a hardirq with another remote coverage collection section happens during handling the softirq, KCOV incorrectly detects a nested softirq coverate collection section and prints a WARNING, as reported by syzbot. This issue was exposed by commit a7f3813e589f ("usb: gadget: dummy_hcd: Switch to hrtimer transfer scheduler"), which switched dummy_hcd to using hrtimer and made the timer's callback be executed in the hardirq context. Change the related checks in KCOV to account for this behavior of in_serving_softirq() and make KCOV ignore remote coverage collection sections in the hardirq and NMI contexts. This prevents the WARNING printed by syzbot but does not fix the inability of KCOV to collect coverage from the __usb_hcd_giveback_urb when dummy_hcd is in use (caused by a7f3813e589f); a separate patch is required for that. Link: https://lkml.kernel.org/r/20240729022158.92059-1-andrey.konovalov@linux.dev Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Andrey Konovalov Reported-by: syzbot+2388cdaeb6b10f0c13ac@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2388cdaeb6b10f0c13ac Acked-by: Marco Elver Cc: Alan Stern Cc: Aleksandr Nogikh Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Marcello Sylvester Bauer Cc: Signed-off-by: Andrew Morton --- kernel/kcov.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index f0a69d402066..274b6b7c718d 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -161,6 +161,15 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, kmsan_unpoison_memory(&area->list, sizeof(area->list)); } +/* + * Unlike in_serving_softirq(), this function returns false when called during + * a hardirq or an NMI that happened in the softirq context. + */ +static inline bool in_softirq_really(void) +{ + return in_serving_softirq() && !in_hardirq() && !in_nmi(); +} + static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -170,7 +179,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru * so we ignore code executed in interrupts, unless we are in a remote * coverage collection section in a softirq. */ - if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) + if (!in_task() && !(in_softirq_really() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -849,7 +858,7 @@ void kcov_remote_start(u64 handle) if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); @@ -991,7 +1000,7 @@ void kcov_remote_stop(void) int sequence; unsigned long flags; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); -- cgit v1.2.3 From 6d45e1c948a8b7ed6ceddb14319af69424db730c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 6 Aug 2024 13:46:47 -0400 Subject: padata: Fix possible divide-by-0 panic in padata_mt_helper() We are hit with a not easily reproducible divide-by-0 panic in padata.c at bootup time. [ 10.017908] Oops: divide error: 0000 1 PREEMPT SMP NOPTI [ 10.017908] CPU: 26 PID: 2627 Comm: kworker/u1666:1 Not tainted 6.10.0-15.el10.x86_64 #1 [ 10.017908] Hardware name: Lenovo ThinkSystem SR950 [7X12CTO1WW]/[7X12CTO1WW], BIOS [PSE140J-2.30] 07/20/2021 [ 10.017908] Workqueue: events_unbound padata_mt_helper [ 10.017908] RIP: 0010:padata_mt_helper+0x39/0xb0 : [ 10.017963] Call Trace: [ 10.017968] [ 10.018004] ? padata_mt_helper+0x39/0xb0 [ 10.018084] process_one_work+0x174/0x330 [ 10.018093] worker_thread+0x266/0x3a0 [ 10.018111] kthread+0xcf/0x100 [ 10.018124] ret_from_fork+0x31/0x50 [ 10.018138] ret_from_fork_asm+0x1a/0x30 [ 10.018147] Looking at the padata_mt_helper() function, the only way a divide-by-0 panic can happen is when ps->chunk_size is 0. The way that chunk_size is initialized in padata_do_multithreaded(), chunk_size can be 0 when the min_chunk in the passed-in padata_mt_job structure is 0. Fix this divide-by-0 panic by making sure that chunk_size will be at least 1 no matter what the input parameters are. Link: https://lkml.kernel.org/r/20240806174647.1050398-1-longman@redhat.com Fixes: 004ed42638f4 ("padata: add basic support for multithreaded jobs") Signed-off-by: Waiman Long Cc: Daniel Jordan Cc: Steffen Klassert Cc: Waiman Long Cc: Signed-off-by: Andrew Morton --- kernel/padata.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 53f4bc912712..0fa6c2895460 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -517,6 +517,13 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, job->min_chunk); ps.chunk_size = roundup(ps.chunk_size, job->align); + /* + * chunk_size can be 0 if the caller sets min_chunk to 0. So force it + * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` + */ + if (!ps.chunk_size) + ps.chunk_size = 1U; + list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); -- cgit v1.2.3 From cb5b81bc9a448f8db817566f60f92e2ea788ea0f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 8 Aug 2024 12:29:40 -0700 Subject: module: warn about excessively long module waits Russell King reported that the arm cbc(aes) crypto module hangs when loaded, and Herbert Xu bisected it to commit 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent"), and noted: "So what's happening here is that the first modprobe tries to load a fallback CBC implementation, in doing so it triggers a load of the exact same module due to module aliases. IOW we're loading aes-arm-bs which provides cbc(aes). However, this needs a fallback of cbc(aes) to operate, which is made out of the generic cbc module + any implementation of aes, or ecb(aes). The latter happens to also be provided by aes-arm-cb so that's why it tries to load the same module again" So loading the aes-arm-bs module ends up wanting to recursively load itself, and the recursive load then ends up waiting for the original module load to complete. This is a regression, in that it used to be that we just tried to load the module multiple times, and then as we went on to install it the second time we would instead just error out because the module name already existed. That is actually also exactly what the original "catch concurrent loads" patch did in commit 9828ed3f695a ("module: error out early on concurrent load of the same module file"), but it turns out that it ends up being racy, in that erroring out before the module has been fully initialized will cause failures in dependent module loading. See commit ac2263b588df (which was the revert of that "error out early") commit for details about why erroring out before the module has been initialized is actually fundamentally racy. Now, for the actual recursive module load (as opposed to just concurrently loading the same module twice), the race is not an issue. At the same time it's hard for the kernel to see that this is recursion, because the module load is always done from a usermode helper, so the recursion is not some simple callchain within the kernel. End result: this is not the real fix, but this at least adds a warning for the situation (admittedly much too late for all the debugging pain that Russell and Herbert went through) and if we can come to a resolution on how to detect the recursion properly, this re-organizes the code to make that easier. Link: https://lore.kernel.org/all/ZrFHLqvFqhzykuYw@shell.armlinux.org.uk/ Reported-by: Russell King Debugged-by: Herbert Xu Signed-off-by: Linus Torvalds --- kernel/module/main.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index d9592195c5bb..6f4ec857bdef 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3183,15 +3183,28 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int if (!f || !(f->f_mode & FMODE_READ)) return -EBADF; - /* See if somebody else is doing the operation? */ - if (idempotent(&idem, file_inode(f))) { - wait_for_completion(&idem.complete); - return idem.ret; + /* Are we the winners of the race and get to do this? */ + if (!idempotent(&idem, file_inode(f))) { + int ret = init_module_from_file(f, uargs, flags); + return idempotent_complete(&idem, ret); } - /* Otherwise, we'll do it and complete others */ - return idempotent_complete(&idem, - init_module_from_file(f, uargs, flags)); + /* + * Somebody else won the race and is loading the module. + * + * We have to wait for it forever, since our 'idem' is + * on the stack and the list entry stays there until + * completed (but we could fix it under the idem_lock) + * + * It's also unclear what a real timeout might be, + * but we could maybe at least make this killable + * and remove the idem entry in that case? + */ + for (;;) { + if (wait_for_completion_timeout(&idem.complete, 10*HZ)) + return idem.ret; + pr_warn_once("module '%pD' taking a long time to load", f); + } } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) -- cgit v1.2.3 From 2124d84db293ba164059077944e6b429ba530495 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 9 Aug 2024 08:33:28 -0700 Subject: module: make waiting for a concurrent module loader interruptible The recursive aes-arm-bs module load situation reported by Russell King is getting fixed in the crypto layer, but this in the meantime fixes the "recursive load hangs forever" by just making the waiting for the first module load be interruptible. This should now match the old behavior before commit 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent"), which used the different "wait for module to be ready" code in module_patient_check_exists(). End result: a recursive module load will still block, but now a signal will interrupt it and fail the second module load, at which point the first module will successfully complete loading. Fixes: 9b9879fc0327 ("modules: catch concurrent module loads, treat them as idempotent") Cc: Russell King Cc: Herbert Xu Signed-off-by: Linus Torvalds --- kernel/module/main.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 6f4ec857bdef..71396e297499 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3104,7 +3104,7 @@ static bool idempotent(struct idempotent *u, const void *cookie) struct idempotent *existing; bool first; - u->ret = 0; + u->ret = -EINTR; u->cookie = cookie; init_completion(&u->complete); @@ -3140,7 +3140,7 @@ static int idempotent_complete(struct idempotent *u, int ret) hlist_for_each_entry_safe(pos, next, head, entry) { if (pos->cookie != cookie) continue; - hlist_del(&pos->entry); + hlist_del_init(&pos->entry); pos->ret = ret; complete(&pos->complete); } @@ -3148,6 +3148,28 @@ static int idempotent_complete(struct idempotent *u, int ret) return ret; } +/* + * Wait for the idempotent worker. + * + * If we get interrupted, we need to remove ourselves from the + * the idempotent list, and the completion may still come in. + * + * The 'idem_lock' protects against the race, and 'idem.ret' was + * initialized to -EINTR and is thus always the right return + * value even if the idempotent work then completes between + * the wait_for_completion and the cleanup. + */ +static int idempotent_wait_for_completion(struct idempotent *u) +{ + if (wait_for_completion_interruptible(&u->complete)) { + spin_lock(&idem_lock); + if (!hlist_unhashed(&u->entry)) + hlist_del(&u->entry); + spin_unlock(&idem_lock); + } + return u->ret; +} + static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { struct load_info info = { }; @@ -3191,20 +3213,8 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int /* * Somebody else won the race and is loading the module. - * - * We have to wait for it forever, since our 'idem' is - * on the stack and the list entry stays there until - * completed (but we could fix it under the idem_lock) - * - * It's also unclear what a real timeout might be, - * but we could maybe at least make this killable - * and remove the idem entry in that case? */ - for (;;) { - if (wait_for_completion_timeout(&idem.complete, 10*HZ)) - return idem.ret; - pr_warn_once("module '%pD' taking a long time to load", f); - } + return idempotent_wait_for_completion(&idem); } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) -- cgit v1.2.3