From a9a1d6ad668f2ea0b5a77d0c4c7a41446d0801a8 Mon Sep 17 00:00:00 2001 From: Dongmin Lee Date: Sat, 4 Nov 2023 20:33:20 +0900 Subject: kernel/reboot: explicitly notify if halt occurred instead of power off When kernel_can_power_off() returns false, and reboot has called with LINUX_REBOOT_CMD_POWER_OFF, kernel_halt() will be initiated instead of actual power off function. However, in this situation, Kernel never explicitly notifies user that system halted instead of requested power off. Since halt and power off perform different behavior, and user initiated reboot call with power off command, not halt, This could be unintended behavior to user, like this: ~ # poweroff -f [ 3.581482] reboot: System halted Therefore, this explicitly notifies user that poweroff is not available, and halting has been occured as an alternative behavior instead: ~ # poweroff -f [ 4.123668] reboot: Power off not available: System halted instead [akpm@linux-foundation.org: tweak comment text] Link: https://lkml.kernel.org/r/20231104113320.72440-1-ldmldm05@gmail.com Signed-off-by: Dongmin Lee Signed-off-by: Andrew Morton --- kernel/reboot.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 395a0ea3c7a8..c3a3b82c4f64 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -58,6 +58,14 @@ struct sys_off_handler { struct device *dev; }; +/* + * This variable is used to indicate if a halt was initiated instead of a + * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but + * the system cannot be powered off. This allowes kernel_halt() to notify users + * of that. + */ +static bool poweroff_fallback_to_halt; + /* * Temporary stub that prevents linkage failure while we're in process * of removing all uses of legacy pm_power_off() around the kernel. @@ -297,7 +305,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - pr_emerg("System halted\n"); + if (poweroff_fallback_to_halt) + pr_emerg("Power off not available: System halted instead\n"); + else + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } @@ -732,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) { + poweroff_fallback_to_halt = true; cmd = LINUX_REBOOT_CMD_HALT; + } mutex_lock(&system_transition_mutex); switch (cmd) { -- cgit v1.2.3 From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2023 16:57:10 +0100 Subject: introduce for_other_threads(p, t) Cosmetic, but imho it makes the usage look more clear and simple, the new helper doesn't require to initialize "t". After this change while_each_thread() has only 3 users, and it is only used in the do/while loops. Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Christian Brauner Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton --- fs/exec.c | 3 +-- include/linux/sched/signal.h | 3 +++ kernel/signal.c | 11 ++++------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 4aa19b24f281..ee43597cb453 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1578,11 +1578,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ - t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - while_each_thread(p, t) { + for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..41d6759d6a4a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) diff --git a/kernel/signal.c b/kernel/signal.c index 47a7602dfe8d..5aa216e841a2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1376,12 +1376,12 @@ int force_sig_info(struct kernel_siginfo *info) */ int zap_other_threads(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; - while_each_thread(p, t) { + for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); /* Don't require de_thread to wait for the vhost_worker */ if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) @@ -2465,12 +2465,10 @@ static bool do_signal_stop(int signr) sig->group_exit_code = signr; sig->group_stop_count = 0; - if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - t = current; - while_each_thread(current, t) { + for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -2966,8 +2964,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) if (sigisemptyset(&retarget)) return; - t = tsk; - while_each_thread(tsk, t) { + for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; -- cgit v1.2.3 From f72709ab69430d986dfc5a08c9a86f625e3fed33 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:36 +0100 Subject: arch: remove ARCH_THREAD_STACK_ALLOCATOR Patch series "Remove unused code after IA-64 removal". While looking into something different I noticed that there are a couple of Kconfig options which were only selected by IA-64 and which are now unused. So remove them and simplify the code a bit. This patch (of 3): IA-64 was the only architecture which selected ARCH_THREAD_STACK_ALLOCATOR. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR as well. 
Link: https://lkml.kernel.org/r/20231116133638.1636277-1-hca@linux.ibm.com Link: https://lkml.kernel.org/r/20231116133638.1636277-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 4 ---- kernel/fork.c | 20 -------------------- 2 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index f4b210ab0612..310162b41a1c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -320,10 +320,6 @@ config HAVE_ARCH_THREAD_STRUCT_WHITELIST should be implemented. Without this, the entire thread_struct field in task_struct will be left whitelisted. -# Select if arch has its private alloc_thread_stack() function -config ARCH_THREAD_STACK_ALLOCATOR - bool - # Select if arch wants to size task_struct dynamically via arch_task_struct_size: config ARCH_WANTS_DYNAMIC_TASK_STRUCT bool diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..d071809866e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -179,8 +179,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR - /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. @@ -412,24 +410,6 @@ void thread_stack_cache_init(void) } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ -#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ - -static int alloc_thread_stack_node(struct task_struct *tsk, int node) -{ - unsigned long *stack; - - stack = arch_alloc_thread_stack_node(tsk, node); - tsk->stack = stack; - return stack ? 0 : -ENOMEM; -} - -static void free_thread_stack(struct task_struct *tsk) -{ - arch_free_thread_stack(tsk); - tsk->stack = NULL; -} - -#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; -- cgit v1.2.3 From 3888750e21ccb909051c810cc79fcc0650a740f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:37 +0100 Subject: arch: remove ARCH_TASK_STRUCT_ALLOCATOR IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ALLOCATOR. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR as well. 
Link: https://lkml.kernel.org/r/20231116133638.1636277-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 5 ----- kernel/fork.c | 6 ------ 2 files changed, 11 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 310162b41a1c..c2f87ef9f0ae 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -305,13 +305,8 @@ config ARCH_HAS_CPU_FINALIZE_INIT config ARCH_TASK_STRUCT_ON_STACK bool -# Select if arch has its private alloc_task_struct() function -config ARCH_TASK_STRUCT_ALLOCATOR - bool - config HAVE_ARCH_THREAD_STRUCT_WHITELIST bool - depends on !ARCH_TASK_STRUCT_ALLOCATOR help An architecture should select this to provide hardened usercopy knowledge about what region of the thread_struct should be diff --git a/kernel/fork.c b/kernel/fork.c index d071809866e0..ce8a4b8c04e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,7 +165,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk) { } -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) @@ -177,7 +176,6 @@ static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } -#endif /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -1001,7 +999,6 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ @@ -1016,12 +1013,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) else *offset += offsetof(struct task_struct, thread); } -#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { int i; -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif @@ -1034,7 +1029,6 @@ void __init fork_init(void) arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); -#endif /* do the arch specific task caches init */ arch_task_cache_init(); -- cgit v1.2.3 From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Nov 2023 16:16:49 +0100 Subject: kernel/signal.c: simplify force_sig_info_to_task(), kill recalc_sigpending_and_wake() The purpose of recalc_sigpending_and_wake() is not clear, it looks "obviously unneeded" because we are going to send the signal which can't be blocked or ignored. Add the comment to explain why we can't rely on send_signal_locked() and make this logic more simple/explicit. recalc_sigpending_and_wake() has no other users, it can die. In fact I think we don't even need signal_wake_up(), the target task must be either current or a TASK_TRACED child, otherwise the usage of siglock is not safe. But this needs another change. 
Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 1 - kernel/signal.c | 17 ++++------------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 41d6759d6a4a..015c0e3a3e1d 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags, * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ -extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); diff --git a/kernel/signal.c b/kernel/signal.c index 5aa216e841a2..c9c57d053ce4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t) return false; } -/* - * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. - * This is superfluous when called on current, the wakeup is a harmless no-op. - */ -void recalc_sigpending_and_wake(struct task_struct *t) -{ - if (recalc_sigpending_tsk(t)) - signal_wake_up(t, 0); -} - void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) @@ -1348,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; - if (blocked) { + if (blocked) sigdelset(&t->blocked, sig); - recalc_sigpending_and_wake(t); - } } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect @@ -1361,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); + /* This can happen if the signal was already pending and blocked */ + if (!task_sigpending(t)) + signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; -- cgit v1.2.3 From 27bbb2a0fddf70e185e800cd78f0142d45330c6c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Nov 2023 17:26:50 +0100 Subject: __ptrace_unlink: kill the obsolete "FIXME" code The corner case described by the comment is no longer possible after the commit 7b3c36fc4c23 ("ptrace: fix task_join_group_stop() for the case when current is traced"), task_join_group_stop() ensures that the new thread has the correct signr in JOBCTL_STOP_SIGMASK regardless of ptrace. Link: https://lkml.kernel.org/r/20231121162650.GA6635@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/ptrace.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d8b5e13a2229..3617213c3d8a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) { + child->signal->group_stop_count)) child->jobctl |= JOBCTL_STOP_PENDING; - /* - * This is only possible if this thread was cloned by the - * traced task running in the stopped group, set the signal - * for the future reports. 
- * FIXME: we should change ptrace_init_task() to handle this - * case. - */ - if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) - child->jobctl |= SIGSTOP; - } - /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child -- cgit v1.2.3 From 0311d8272406b2ec47f485bef887723cc352a489 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 14 Nov 2023 17:12:01 +0100 Subject: kexec: use atomic_try_cmpxchg in crash_kexec Use atomic_try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in crash_kexec(). x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. No functional change intended. Link: https://lkml.kernel.org/r/20231114161228.108516-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index be5642a4ec49..bc4c096ab1f3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1063,9 +1063,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * panic(). Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ + old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); - if (old_cpu == PANIC_CPU_INVALID) { + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); -- cgit v1.2.3 From b1c3efe07987592c16d5f59ce235e6ddbea65a73 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:03 +0100 Subject: sched: fair: move unused stub functions to header These four functions have a normal definition for CONFIG_FAIR_GROUP_SCHED, and empty one that is only referenced when FAIR_GROUP_SCHED is disabled but CGROUP_SCHED is still enabled. If both are turned off, the functions are still defined but the misisng prototype causes a W=1 warning: kernel/sched/fair.c:12544:6: error: no previous prototype for 'free_fair_sched_group' kernel/sched/fair.c:12546:5: error: no previous prototype for 'alloc_fair_sched_group' kernel/sched/fair.c:12553:6: error: no previous prototype for 'online_fair_sched_group' kernel/sched/fair.c:12555:6: error: no previous prototype for 'unregister_fair_sched_group' Move the alternatives into the header as static inline functions with the correct combination of #ifdef checks to avoid the warning without adding even more complexity. [A different patch with the same description got applied by accident and was later reverted, but the original patch is still missing] Link: https://lkml.kernel.org/r/20231123110506.707903-4-arnd@kernel.org Fixes: 7aa55f2a5902 ("sched/fair: Move unused stub functions to header") Signed-off-by: Arnd Bergmann Cc: "David S. 
Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Greg Kroah-Hartman Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Kees Cook Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Tudor Ambarus Cc: Yoshinori Sato Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 13 ------------- kernel/sched/sched.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..835a40eac462 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13036,19 +13036,6 @@ next_cpu: return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void online_fair_sched_group(struct task_group *tg) { } - -void unregister_fair_sched_group(struct task_group *tg) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a42..8f5df5250b8d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -436,10 +436,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_FAIR_GROUP_SCHED extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); +#else +static inline void free_fair_sched_group(struct task_group *tg) { } +static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +static inline void online_fair_sched_group(struct task_group *tg) { } +static inline void unregister_fair_sched_group(struct task_group *tg) { } +#endif + extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); -- cgit v1.2.3 From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 15 Nov 2023 21:00:27 +0800 Subject: resource: add walk_system_ram_res_rev() This function, being a variant of walk_system_ram_res() introduced in commit 8c86e70acead ("resource: provide new functions to walk through resources"), walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. It will be used in kexec_file code to load kernel, initrd etc when preparing kexec reboot. 
Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv Signed-off-by: AKASHI Takahiro Signed-off-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/ioport.h | 3 +++ kernel/resource.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 14f5cfabbbc8..db7fe25f3370 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -331,6 +331,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int +walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); diff --git a/kernel/resource.c b/kernel/resource.c index 866ef3663a0b..e8a244300e5b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include @@ -429,6 +431,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, func); } +/* + * This function, being a variant of walk_system_ram_res(), calls the @func + * callback against all memory ranges of type System RAM which are marked as + * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from + * higher to lower. + */ +int walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res, *rams; + int rams_size = 16, i; + unsigned long flags; + int ret = -1; + + /* create a list */ + rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL); + if (!rams) + return ret; + + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + i = 0; + while ((start < end) && + (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) { + if (i >= rams_size) { + /* re-alloc */ + struct resource *rams_new; + + rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), + (rams_size + 16) * sizeof(struct resource), + GFP_KERNEL); + if (!rams_new) + goto out; + + rams = rams_new; + rams_size += 16; + } + + rams[i].start = res.start; + rams[i++].end = res.end; + + start = res.end + 1; + } + + /* go reverse */ + for (i--; i >= 0; i--) { + ret = (*func)(&rams[i], arg); + if (ret) + break; + } + +out: + kvfree(rams); + return ret; +} + /* * This function calls the @func callback against all memory ranges, which * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. -- cgit v1.2.3 From b3ba234171cd0d58df0a13c262210ff8b5fd2830 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 14 Nov 2023 17:16:58 +0800 Subject: kexec_file: load kernel at top of system RAM if required Patch series "kexec_file: Load kernel at top of system RAM if required". Justification: ============== Kexec_load interface has been doing top down searching and loading kernel/initrd/purgtory etc to prepare for kexec reboot. In that way, the benefits are that it avoids to consume and fragment limited low memory which satisfy DMA buffer allocation and big chunk of continuous memory during system init; and avoids to stir with BIOS/FW reserved or occupied areas, or corner case handling/work around/quirk occupied areas when doing system init. By the way, the top-down searching and loading of kexec-ed kernel is done in user space utility code. For kexec_file loading, even if kexec_buf.top_down is 'true', it's simply ignored. 
It calls walk_system_ram_res() directly to go through all resources of System RAM bottom up, to find an available memory region, then call locate_mem_hole_callback() to allocate memory in that found memory region from top to down. This is not expected and inconsistent with kexec_load. Implementation =============== In patch 1, introduce a new function walk_system_ram_res_rev() which is a variant of walk_system_ram_res(), it walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. In patch 2, check if kexec_buf.top_down is 'true' in kexec_walk_resources(), if yes, call walk_system_ram_res_rev() to find memory region of system RAM from top to down to load kernel/initrd etc. Background information: ======================= And I ever tried this in the past in a different way, please see below link. In the post, I tried to adjust struct sibling linking code, replace the the singly linked list with list_head so that walk_system_ram_res_rev() can be implemented in a much easier way. Finally I failed. https://lore.kernel.org/all/20180718024944.577-4-bhe@redhat.com/ This time, I picked up the patch from AKASHI Takahiro's old post and made some change to take as the current patch 1: https://lists.infradead.org/pipermail/linux-arm-kernel/2017-September/531456.html This patch (of 2): Kexec_load interface has been doing top down searching and loading kernel/initrd/purgtory etc to prepare for kexec reboot. In that way, the benefits are that it avoids to consume and fragment limited low memory which satisfy DMA buffer allocation and big chunk of continuous memory during system init; and avoids to stir with BIOS/FW reserved or occupied areas, or corner case handling/work around/quirk occupied areas when doing system init. By the way, the top-down searching and loading of kexec-ed kernel is done in user space utility code. For kexec_file loading, even if kexec_buf.top_down is 'true', it's simply ignored. It calls walk_system_ram_res() directly to go through all resources of System RAM bottom up, to find an available memory region, then call locate_mem_hole_callback() to allocate memory in that found memory region from top to down. This is not expected and inconsistent with kexec_load. Here check if kexec_buf.top_down is 'true' in kexec_walk_resources(), if yes, call the newly added walk_system_ram_res_rev() to find memory region of system RAM from top to down to load kernel/initrd etc. 
Link: https://lkml.kernel.org/r/20231114091658.228030-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20231114091658.228030-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: AKASHI Takahiro Cc: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9a419cd22d4..ba3ef30921b8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -592,6 +592,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); + else if (kbuf->top_down) + return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } -- cgit v1.2.3 From 9d02330abd3ecdacc43fc6b16a5668e7e94b7562 Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 23 Nov 2023 16:40:22 +0800 Subject: softlockup: serialized softlockup's log If multiple CPUs trigger softlockup at the same time with 'softlockup_all_cpu_backtrace=0', the softlockup's logs will appear staggeredly in dmesg, which will affect the viewing of the logs for developer. Since the code path for outputting softlockup logs is not a kernel hotspot and the performance requirements for the code are not strict, locks are used to serialize the softlockup log output to improve the readability of the logs. Link: https://lkml.kernel.org/r/20231123084022.10302-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Reviewed-by: Petr Mladek Reviewed-by: Douglas Anderson Cc: Lecopzer Chen Cc: Pingfan Liu Cc: Zefan Li Cc: John Ogness Signed-off-by: Andrew Morton --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5cd6d4e26915..bf30a6fac665 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -448,6 +448,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + static DEFINE_SPINLOCK(watchdog_output_lock); if (!watchdog_enabled) return HRTIMER_NORESTART; @@ -514,6 +515,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); + spin_lock(&watchdog_output_lock); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -523,6 +525,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) show_regs(regs); else dump_stack(); + spin_unlock(&watchdog_output_lock); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); -- cgit v1.2.3 From cbc2fe9d9cb226347365753f50d81bc48cc3c52e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 13 Dec 2023 13:57:41 +0800 Subject: kexec_file: add kexec_file flag to control debug printing Patch series "kexec_file: print out debugging message if required", v4. Currently, specifying '-d' on kexec command will print a lot of debugging informationabout kexec/kdump loading with kexec_load interface. However, kexec_file_load prints nothing even though '-d' is specified. It's very inconvenient to debug or analyze the kexec/kdump loading when something wrong happened with kexec/kdump itself or develper want to check the kexec/kdump loading. In this patchset, a kexec_file flag is KEXEC_FILE_DEBUG added and checked in code. 
If it's passed in, debugging message of kexec_file code will be printed out and can be seen from console and dmesg. Otherwise, the debugging message is printed like beofre when pr_debug() is taken. Note: **** ===== 1) The code in kexec-tools utility also need be changed to support passing KEXEC_FILE_DEBUG to kernel when 'kexec -s -d' is specified. The patch link is here: ========= [PATCH] kexec_file: add kexec_file flag to support debug printing http://lists.infradead.org/pipermail/kexec/2023-November/028505.html 2) s390 also has kexec_file code, while I am not sure what debugging information is necessary. So leave it to s390 developer. Test: **** ==== Testing was done in v1 on x86_64 and arm64. For v4, tested on x86_64 again. And on x86_64, the printed messages look like below: -------------------------------------------------------------- kexec measurement buffer for the loaded kernel at 0x207fffe000. Loaded purgatory at 0x207fff9000 Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 memsz=0x1180 Loaded 64bit kernel at 0x207c000000 bufsz=0xc88200 memsz=0x3c4a000 Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280 Final command line is: root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 crashkernel=256M E820 memmap: 0000000000000000-000000000009a3ff (1) 000000000009a400-000000000009ffff (2) 00000000000e0000-00000000000fffff (2) 0000000000100000-000000006ff83fff (1) 000000006ff84000-000000007ac50fff (2) ...... 000000207fff6150-000000207fff615f (128) 000000207fff6160-000000207fff714f (1) 000000207fff7150-000000207fff715f (128) 000000207fff7160-000000207fff814f (1) 000000207fff8150-000000207fff815f (128) 000000207fff8160-000000207fffffff (1) nr_segments = 5 segment[0]: buf=0x000000004e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000 segment[1]: buf=0x000000009e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000 segment[2]: buf=0x00000000d879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000 segment[3]: buf=0x000000001101cd86 bufsz=0xc88200 mem=0x207c000000 memsz=0x3c4a000 segment[4]: buf=0x00000000c6e38ac7 bufsz=0x2186280 mem=0x2079e79000 memsz=0x2187000 kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8 --------------------------------------------------------------------------- This patch (of 7): When specifying 'kexec -c -d', kexec_load interface will print loading information, e.g the regions where kernel/initrd/purgatory/cmdline are put, the memmap passed to 2nd kernel taken as system RAM ranges, and printing all contents of struct kexec_segment, etc. These are very helpful for analyzing or positioning what's happening when kexec/kdump itself failed. The debugging printing for kexec_load interface is made in user space utility kexec-tools. Whereas, with kexec_file_load interface, 'kexec -s -d' print nothing. Because kexec_file code is mostly implemented in kernel space, and the debugging printing functionality is missed. It's not convenient when debugging kexec/kdump loading and jumping with kexec_file_load interface. Now add KEXEC_FILE_DEBUG to kexec_file flag to control the debugging message printing. And add global variable kexec_file_dbg_print and macro kexec_dprintk() to facilitate the printing. This is a preparation, later kexec_dprintk() will be used to replace the existing pr_debug(). Once 'kexec -s -d' is specified, it will print out kexec/kdump loading information. If '-d' is not specified, it regresses to pr_debug(). 
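As an illustration (not part of this patch), userspace that wants the new output would simply OR the flag into the flags argument of the kexec_file_load syscall:

	/* hypothetical snippet: request debug printing for a kdump load */
	ret = syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		      cmdline_len, cmdline,
		      KEXEC_FILE_ON_CRASH | KEXEC_FILE_DEBUG);

This is what the kexec-tools patch referenced above is meant to arrange when 'kexec -s -d' is specified.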
Link: https://lkml.kernel.org/r/20231213055747.61826-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20231213055747.61826-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: Conor Dooley Cc: Joe Perches Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/kexec.h | 9 ++++++++- include/uapi/linux/kexec.h | 1 + kernel/kexec_core.c | 2 ++ kernel/kexec_file.c | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8227455192b7..400cb6c02176 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ - KEXEC_FILE_NO_INITRAMFS) + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; @@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; } static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } #endif +extern bool kexec_file_dbg_print; + +#define kexec_dprintk(fmt, ...) \ + printk("%s" fmt, \ + kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG, \ + ##__VA_ARGS__) + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 01766dd839b0..c17bb096ea68 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -25,6 +25,7 @@ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 +#define KEXEC_FILE_DEBUG 0x00000008 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bc4c096ab1f3..64072acef2b6 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -52,6 +52,8 @@ atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +bool kexec_file_dbg_print; + int kexec_should_crash(struct task_struct *p) { /* diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ba3ef30921b8..3ee204474de6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image) */ kfree(image->image_loader_data); image->image_loader_data = NULL; + + kexec_file_dbg_print = false; } #ifdef CONFIG_KEXEC_SIG @@ -278,6 +280,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!image) return -ENOMEM; + kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; if (kexec_on_panic) { -- cgit v1.2.3 From a85ee18c7900f001f42082d2fabce4eaf57e655f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 13 Dec 2023 13:57:42 +0800 Subject: kexec_file: print out debugging message if required Then when specifying '-d' for kexec_file_load interface, loaded locations of kernel/initrd/cmdline etc can be printed out to help debug. Here replace pr_debug() with the newly added kexec_dprintk() in kexec_file loading related codes. And also print out type/start/head of kimage and flags to help debug. 
Link: https://lkml.kernel.org/r/20231213055747.61826-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Conor Dooley Cc: Joe Perches Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- kernel/crash_core.c | 8 +++++--- kernel/kexec_file.c | 11 ++++++++--- security/integrity/ima/ima_kexec.c | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index d4313b53837e..c97e825a0fd9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -551,9 +551,11 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); +#ifdef CONFIG_KEXEC_FILE + kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); +#endif phdr++; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3ee204474de6..aca5f3668f4c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -204,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret < 0) return ret; image->kernel_buf_len = ret; + kexec_dprintk("kernel: %p kernel_size: %#lx\n", + image->kernel_buf, image->kernel_buf_len); /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, @@ -387,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + kexec_dprintk("nr_segments = %lu\n", image->nr_segments); for (i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; ksegment = &image->segment[i]; - pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", - i, ksegment->buf, ksegment->bufsz, ksegment->mem, - ksegment->memsz); + kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); ret = kimage_load_segment(image, &image->segment[i]); if (ret) @@ -406,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n", + image->type, image->start, image->head, flags); /* * Free up any temporary buffers allocated which are not needed * after image has been loaded diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index ad133fe120db..dadc1d138118 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -129,8 +129,8 @@ void ima_add_kexec_buffer(struct kimage *image) image->ima_buffer_size = kexec_segment_size; image->ima_buffer = kexec_buffer; - pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n", - kbuf.mem); + kexec_dprintk("kexec measurement buffer for the loaded kernel at 0x%lx.\n", + kbuf.mem); } #endif /* IMA_KEXEC */ -- cgit v1.2.3 From a903904c5fa06e8d8472509741f79e8eb25ff864 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Fri, 8 Dec 2023 16:41:15 +0800 Subject: fork: remove redundant TASK_UNINTERRUPTIBLE TASK_KILLABLE already includes TASK_UNINTERRUPTIBLE, so there is no need to add a separate TASK_UNINTERRUPTIBLE. 
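For reference, TASK_KILLABLE is defined in include/linux/sched.h as a mask that already contains TASK_UNINTERRUPTIBLE, which is why the extra flag removed below is redundant:

	#define TASK_KILLABLE	(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)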
Link: https://lkml.kernel.org/r/20231208084115.1973285-1-haokexin@gmail.com Signed-off-by: Kevin Hao Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ce8a4b8c04e2..d71c8ade8f9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1562,7 +1562,7 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { - unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; + unsigned int state = TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); -- cgit v1.2.3 From db6b6fb70193f0defe4d5785e940156c06e9abbe Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 12 Dec 2023 22:27:06 +0800 Subject: kexec: use ALIGN macro instead of open-coding it Use ALIGN macro instead of open-coding it to improve code readability. Link: https://lkml.kernel.org/r/20231212142706.25149-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 64072acef2b6..6e0f022987ff 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -432,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, pages = NULL; size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; @@ -449,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); + hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } -- cgit v1.2.3 From 4459cd2e167e7208e57d517d16282408d9035dad Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Fri, 15 Dec 2023 16:54:51 +0800 Subject: crash_core: remove duplicated including of kexec.h Remove second include of linux/kexec.h Link: https://lkml.kernel.org/r/202312151654+0800-wangjinchao@xfusion.com Signed-off-by: Wang Jinchao Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index c97e825a0fd9..6f074e112c1e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 816d334afa85c836080b41bb6238aea845615ad9 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sun, 17 Dec 2023 11:35:26 +0800 Subject: kexec: modify the meaning of the end parameter in kimage_is_destination_range() The end parameter received by kimage_is_destination_range() should be the last valid byte address of the target memory segment plus 1. However, in the locate_mem_hole_bottom_up() and locate_mem_hole_top_down() functions, the corresponding value passed to kimage_is_destination_range() is the last valid byte address of the target memory segment, which is 1 less. There are two ways to fix this bug. 
We can either correct the logic of the locate_mem_hole_bottom_up() and locate_mem_hole_top_down() functions, or we can fix kimage_is_destination_range() by making the end parameter represent the last valid byte address of the target memory segment. Here, we choose the second approach. Due to the modification to kimage_is_destination_range(), we also need to adjust its callers, such as kimage_alloc_normal_control_pages() and kimage_alloc_page(). Link: https://lkml.kernel.org/r/20231217033528.303333-2-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 6e0f022987ff..2f039a7d9af9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -278,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image, unsigned long mstart, mend; mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) + mend = mstart + image->segment[i].memsz - 1; + if ((end >= mstart) && (start <= mend)) return 1; } @@ -372,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; + eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); @@ -718,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) + addr + PAGE_SIZE - 1)) break; /* -- cgit v1.2.3 From 18d565ea95fe553f442c5bbc5050415bab3c3fa4 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sun, 17 Dec 2023 11:35:27 +0800 Subject: kexec_file: fix incorrect temp_start value in locate_mem_hole_top_down() temp_end represents the address of the last available byte. Therefore, the starting address of the memory segment with temp_end as its last available byte and a size of `kbuf->memsz`, that is, the value of temp_start, should be `temp_end - kbuf->memsz + 1` instead of `temp_end - kbuf->memsz`. Additionally, use the ALIGN_DOWN macro instead of open-coding it directly in locate_mem_hole_top_down() to improve code readability. Link: https://lkml.kernel.org/r/20231217033528.303333-3-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "Eric W. Biederman" Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index aca5f3668f4c..bef2f6f2571b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -434,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, unsigned long temp_start, temp_end; temp_end = min(end, kbuf->buf_max); - temp_start = temp_end - kbuf->memsz; + temp_start = temp_end - kbuf->memsz + 1; do { /* align down start */ - temp_start = temp_start & (~(kbuf->buf_align - 1)); + temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align); if (temp_start < start || temp_start < kbuf->buf_min) return 0; -- cgit v1.2.3 From d391615618e8b2c30ef1e09c1705a7b1751f74f0 Mon Sep 17 00:00:00 2001 From: Ahelenia Ziemiańska Date: Tue, 19 Dec 2023 23:24:14 +0100 Subject: kernel: relay: remove relay_file_splice_read dead code, doesn't work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documentation/filesystems/relay.rst says to use return debugfs_create_file(filename, mode, parent, buf, &relay_file_operations); and this is the only way relay_file_operations is used. Thus: debugfs_create_file(&relay_file_operations) -> __debugfs_create_file(&debugfs_full_proxy_file_operations, &relay_file_operations) -> dentry{inode: {i_fop: &debugfs_full_proxy_file_operations}, d_fsdata: &relay_file_operations | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT} debugfs_full_proxy_file_operations.open is full_proxy_open, which extracts the &relay_file_operations from the dentry, and allocates via __full_proxy_fops_init() new fops, with trivial wrappers around release, llseek, read, write, poll, and unlocked_ioctl, then replaces the fops on the opened file therewith. Naturally, all thusly-created debugfs files have .splice_read = NULL. This was introduced in commit 49d200deaa68 ("debugfs: prevent access to removed files' private data") from 2016-03-22. AFAICT, relay_file_operations is the only struct file_operations used for debugfs which defines a .splice_read callback. Hooking it up with > diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c > index 5063434be0fc..952fcf5b2afa 100644 > --- a/fs/debugfs/file.c > +++ b/fs/debugfs/file.c > @@ -328,6 +328,11 @@ FULL_PROXY_FUNC(write, ssize_t, filp, > loff_t *ppos), > ARGS(filp, buf, size, ppos)); > > +FULL_PROXY_FUNC(splice_read, long, in, > + PROTO(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, > + size_t len, unsigned int flags), > + ARGS(in, ppos, pipe, len, flags)); > + > FULL_PROXY_FUNC(unlocked_ioctl, long, filp, > PROTO(struct file *filp, unsigned int cmd, unsigned long arg), > ARGS(filp, cmd, arg)); > @@ -382,6 +387,8 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops, > proxy_fops->write = full_proxy_write; > if (real_fops->poll) > proxy_fops->poll = full_proxy_poll; > + if (real_fops->splice_read) > + proxy_fops->splice_read = full_proxy_splice_read; > if (real_fops->unlocked_ioctl) > proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; > } shows it just doesn't work, and splicing always instantly returns empty (subsequent reads actually return the contents). No-one noticed it became dead code in 2016, who knows if it worked back then. Clearly no-one cares; just delete it. 
Link: https://lkml.kernel.org/r/dtexwpw6zcdx7dkx3xj5gyjp5syxmyretdcbcdtvrnukd4vvuh@tarta.nabijaczleweli.xyz Signed-off-by: Ahelenia Ziemiańska Cc: Greg Kroah-Hartman Cc: Li kunyu Cc: Mike Rapoport (IBM) Cc: Rafael J. Wysocki Cc: Suren Baghdasaryan Cc: Zhang Zhengming Cc: Zhao Lei Signed-off-by: Andrew Morton --- kernel/relay.c | 162 --------------------------------------------------------- 1 file changed, 162 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 83fe0325cde1..a8e90e98bf2c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1073,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp, return written; } -static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) -{ - rbuf->bytes_consumed += bytes_consumed; - - if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { - relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); - rbuf->bytes_consumed %= rbuf->chan->subbuf_size; - } -} - -static void relay_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct rchan_buf *rbuf; - - rbuf = (struct rchan_buf *)page_private(buf->page); - relay_consume_bytes(rbuf, buf->private); -} - -static const struct pipe_buf_operations relay_pipe_buf_ops = { - .release = relay_pipe_buf_release, - .try_steal = generic_pipe_buf_try_steal, - .get = generic_pipe_buf_get, -}; - -static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -{ -} - -/* - * subbuf_splice_actor - splice up to one subbuf's worth of data - */ -static ssize_t subbuf_splice_actor(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags, - int *nonpad_ret) -{ - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; - struct rchan_buf *rbuf = in->private_data; - unsigned int subbuf_size = rbuf->chan->subbuf_size; - uint64_t pos = (uint64_t) *ppos; - uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; - size_t read_start = (size_t) do_div(pos, alloc_size); - size_t read_subbuf = read_start / subbuf_size; - size_t padding = rbuf->padding[read_subbuf]; - size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .nr_pages = 0, - .nr_pages_max = PIPE_DEF_BUFFERS, - .partial = partial, - .ops = &relay_pipe_buf_ops, - .spd_release = relay_page_release, - }; - ssize_t ret; - - if (rbuf->subbufs_produced == rbuf->subbufs_consumed) - return 0; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - /* - * Adjust read len, if longer than what is available - */ - if (len > (subbuf_size - read_start % subbuf_size)) - len = subbuf_size - read_start % subbuf_size; - - subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; - pidx = (read_start / PAGE_SIZE) % subbuf_pages; - poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); - - for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { - unsigned int this_len, this_end, private; - unsigned int cur_pos = read_start + total_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - poff); - private = this_len; - - spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; - spd.partial[spd.nr_pages].offset = poff; - - this_end = cur_pos + this_len; - if (this_end >= nonpad_end) { - this_len = nonpad_end - cur_pos; - private = this_len + padding; - } - spd.partial[spd.nr_pages].len = this_len; - spd.partial[spd.nr_pages].private = 
private; - - len -= this_len; - total_len += this_len; - poff = 0; - pidx = (pidx + 1) % subbuf_pages; - - if (this_end >= nonpad_end) { - spd.nr_pages++; - break; - } - } - - ret = 0; - if (!spd.nr_pages) - goto out; - - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); - if (ret < 0 || ret < total_len) - goto out; - - if (read_start + ret == nonpad_end) - ret += padding; - -out: - splice_shrink_spd(&spd); - return ret; -} - -static ssize_t relay_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - ssize_t spliced; - int ret; - int nonpad_ret = 0; - - ret = 0; - spliced = 0; - - while (len && !spliced) { - ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); - if (ret < 0) - break; - else if (!ret) { - if (flags & SPLICE_F_NONBLOCK) - ret = -EAGAIN; - break; - } - - *ppos += ret; - if (ret > len) - len = 0; - else - len -= ret; - spliced += nonpad_ret; - nonpad_ret = 0; - } - - if (spliced) - return spliced; - - return ret; -} const struct file_operations relay_file_operations = { .open = relay_file_open, @@ -1242,6 +1081,5 @@ const struct file_operations relay_file_operations = { .read = relay_file_read, .llseek = no_llseek, .release = relay_file_release, - .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); -- cgit v1.2.3 From 5f981878c71eaa8dc5563805152bd129a73a90be Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Dec 2023 21:49:45 -0800 Subject: stacktrace: fix kernel-doc typo Change @task to @tsk to prevent kernel-doc warnings: kernel/stacktrace.c:138: warning: Excess function parameter 'task' description in 'stack_trace_save_tsk' kernel/stacktrace.c:138: warning: Function parameter or member 'tsk' not described in 'stack_trace_save_tsk' Link: https://lkml.kernel.org/r/20231220054945.17663-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 4f65824879ab..afb3c116da91 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save); /** * stack_trace_save_tsk - Save a task stack trace into a storage array - * @task: The task to examine + * @tsk: The task to examine * @store: Pointer to storage array * @size: Size of the storage array * @skipnr: Number of entries to skip at the start of the stack trace -- cgit v1.2.3 From 2861b37732627d7d115d77585ce4853f25cf332d Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 21 Dec 2023 12:23:08 +0800 Subject: kexec_core: fix the assignment to kimage->control_page image->control_page represents the starting address for allocating the next control page, while hole_end represents the address of the last valid byte of the currently allocated control page. This bug actually does not affect the correctness of allocating control pages, because image->control_page is currently only used in kimage_alloc_crash_control_pages(), and this function, when allocating control pages, will first align image->control_page up to the nearest `(1 << order) << PAGE_SHIFT` boundary, then use this value as the starting address of the next control page. This ensures that the newly allocated control page will use the correct starting address and not overlap with previously allocated control pages. 
Although it does not affect the correctness of the final result, it is better for us to set image->control_page to the correct value, in case it might be used elsewhere in the future, potentially causing errors. Therefore, after successfully allocating a control page, image->control_page should be updated to `hole_end + 1`, rather than hole_end. Link: https://lkml.kernel.org/r/20231221042308.11076-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Cc: Baoquan He Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 2f039a7d9af9..a08031b57a61 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -457,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); - image->control_page = hole_end; + image->control_page = hole_end + 1; break; } } -- cgit v1.2.3 From 6dcde5d5f248291c5ff6cbe00a7fa6ae400d1aa9 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:34 -0800 Subject: watchdog/hardlockup: adopt softlockup logic avoiding double-dumps Patch series "watchdog: Better handling of concurrent lockups". When we get multiple lockups at roughly the same time, the output in the kernel logs can be very confusing since the reports about the lockups end up interleaved in the logs. There is some code in the kernel to try to handle this but it wasn't that complete. Li Zhe recently made this a bit better for softlockups (specifically for the case where `kernel.softlockup_all_cpu_backtrace` is not set) in commit 9d02330abd3e ("softlockup: serialized softlockup's log"), but that only handled softlockup reports. Hardlockup reports still had similar issues. This series also has a small fix to avoid dumping all stacks a second time in the case of a panic. This is a bit unrelated to the interleaving fixes but it does also improve the clarity of lockup reports. This patch (of 4): The hardlockup detector and softlockup detector both have the ability to dump the stack of all CPUs (`kernel.hardlockup_all_cpu_backtrace` and `kernel.softlockup_all_cpu_backtrace`). Both detectors also have some logic to attempt to avoid interleaving printouts if two CPUs were trying to do dumps of all CPUs at the same time. However: - The hardlockup detector's logic still allowed interleaving some information. Specifically another CPU could print modules and dump the stack of the locked CPU at the same time we were dumping all CPUs. - In the case where `kernel.hardlockup_panic` was set in addition to `kernel.hardlockup_all_cpu_backtrace`, when two CPUs both detected hardlockups at the same time the second CPU could call panic() while the first was still dumping stacks. This was especially bad if the locked up CPU wasn't responding to the request for a backtrace since the function nmi_trigger_cpumask_backtrace() can wait up to 10 seconds. Let's resolve this by adopting the softlockup logic in the hardlockup handler. NOTES: - As part of this, one might think that we should make a helper function that both the hard and softlockup detectors call. This turns out not to be super trivial since it would have to be parameterized quite a bit since there are separate global variables controlling each lockup detector and they print log messages that are just different enough that it would be a pain. 
We probably don't want to change the messages that are printed without good reason to avoid throwing log parsers for a loop. - One might also think that it would be a good idea to have the hardlockup and softlockup detector use the same global variable to prevent interleaving. This would make sure that softlockups and hardlockups can't interleave each other. That _almost_ works but has a dangerous flaw if `kernel.hardlockup_panic` is not the same as `kernel.softlockup_panic` because we might skip a call to panic() if one type of lockup was detected at the same time as another. Link: https://lkml.kernel.org/r/20231220211640.2023645-1-dianders@chromium.org Link: https://lkml.kernel.org/r/20231220131534.1.I4f35a69fbb124b5f0c71f75c631e11fabbe188ff@changeid Signed-off-by: Douglas Anderson Cc: John Ogness Cc: Lecopzer Chen Cc: Li Zhe Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bf30a6fac665..b4fd2f12137f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); -static unsigned long watchdog_hardlockup_all_cpu_dumped; +static unsigned long hard_lockup_nmi_warn; notrace void arch_touch_nmi_watchdog(void) { @@ -156,6 +156,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) if (per_cpu(watchdog_hardlockup_warned, cpu)) return; + /* + * Prevent multiple hard-lockup reports if one cpu is already + * engaged in dumping all cpu back traces. + */ + if (sysctl_hardlockup_all_cpu_backtrace) { + if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) + return; + } + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); print_modules(); print_irqtrace_events(current); @@ -168,13 +177,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) trigger_single_cpu_backtrace(cpu); } - /* - * Perform multi-CPU dump only once to avoid multiple - * hardlockups generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) + if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); + clear_bit_unlock(0, &hard_lockup_nmi_warn); + } if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); -- cgit v1.2.3 From 896260a6d69d01ceb1aad8cfe4298bd837a67432 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:35 -0800 Subject: watchdog/softlockup: use printk_cpu_sync_get_irqsave() to serialize reporting Instead of introducing a spinlock, use printk_cpu_sync_get_irqsave() and printk_cpu_sync_put_irqrestore() to serialize softlockup reporting. Alone this doesn't have any real advantage over the spinlock, but this will allow us to use the same function in a future change to also serialize hardlockup crawls. NOTE: for the most part this serialization is important because we often end up in the show_regs() path and that has no built-in serialization if there are multiple callers at once. However, even in the case where we end up in the dump_stack() path this still has some advantages because the stack will be guaranteed to be together in the logs with the lockup message with no interleaving. 
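In rough outline, the serialization pattern being adopted looks like this (kernel context assumed; the exact code is in the diff below):

        unsigned long flags;

        printk_cpu_sync_get_irqsave(flags);
        pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                 smp_processor_id(), duration, current->comm,
                 task_pid_nr(current));
        if (regs)
                show_regs(regs);
        else
                dump_stack();   /* may take the same cpu-sync "lock" again */
        printk_cpu_sync_put_irqrestore(flags);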
NOTE: the fact that printk_cpu_sync_get_irqsave() is allowed to be called multiple times on the same CPU is important here. Specifically we hold the "lock" while calling dump_stack() which also gets the same "lock". This is explicitly documented to be OK and means we don't need to introduce a variant of dump_stack() that doesn't grab the lock. Link: https://lkml.kernel.org/r/20231220131534.2.Ia5906525d440d8e8383cde31b7c61c2aadc8f907@changeid Signed-off-by: Douglas Anderson Reviewed-by: John Ogness Reviewed-by: Li Zhe Cc: Lecopzer Chen Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b4fd2f12137f..526041a1100a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -454,7 +454,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - static DEFINE_SPINLOCK(watchdog_output_lock); + unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; @@ -521,7 +521,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); - spin_lock(&watchdog_output_lock); + printk_cpu_sync_get_irqsave(flags); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -531,7 +531,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) show_regs(regs); else dump_stack(); - spin_unlock(&watchdog_output_lock); + printk_cpu_sync_put_irqrestore(flags); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); -- cgit v1.2.3 From ee6bdb3f4bf046ff7878c6103b8c88bb4ccfb11d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:36 -0800 Subject: watchdog/hardlockup: use printk_cpu_sync_get_irqsave() to serialize reporting If two CPUs end up reporting a hardlockup at the same time then their logs could get interleaved which is hard to read. The interleaving problem was especially bad with the "perf" hardlockup detector where the locked up CPU is always the same as the running CPU and we end up in show_regs(). show_regs() has no inherent serialization so we could mix together two crawls if two hardlockups happened at the same time (and if we didn't have `sysctl_hardlockup_all_cpu_backtrace` set). With this change we'll fully serialize hardlockups when using the "perf" hardlockup detector. The interleaving problem was less bad with the "buddy" hardlockup detector. With "buddy" we always end up calling `trigger_single_cpu_backtrace(cpu)` on some CPU other than the running one. trigger_single_cpu_backtrace() always at least serializes the individual stack crawls because it eventually uses printk_cpu_sync_get_irqsave(). Unfortunately the fact that trigger_single_cpu_backtrace() eventually calls printk_cpu_sync_get_irqsave() (on a different CPU) means that we have to drop the "lock" before calling it and we can't fully serialize all printouts associated with a given hardlockup. However, we still do get the advantage of serializing the output of print_modules() and print_irqtrace_events(). 
Aside from serializing hardlockups from each other, this change also has the advantage of serializing hardlockups and softlockups from each other if they happen to happen at the same time since they are both using the same "lock". Even though nobody is expected to hang while holding the lock associated with printk_cpu_sync_get_irqsave(), out of an abundance of caution, we don't call printk_cpu_sync_get_irqsave() until after we print out about the hardlockup. This makes extra sure that, even if printk_cpu_sync_get_irqsave() somehow never runs we at least print that we saw the hardlockup. This is different than the choice made for softlockup because hardlockup is really our last resort. Link: https://lkml.kernel.org/r/20231220131534.3.I6ff691b3b40f0379bc860f80c6e729a0485b5247@changeid Signed-off-by: Douglas Anderson Reviewed-by: John Ogness Cc: Lecopzer Chen Cc: Li Zhe Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 526041a1100a..11f9577accca 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -151,6 +151,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); + unsigned long flags; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) @@ -165,7 +166,17 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) return; } + /* + * NOTE: we call printk_cpu_sync_get_irqsave() after printing + * the lockup message. While it would be nice to serialize + * that printout, we really want to make sure that if some + * other CPU somehow locked up while holding the lock associated + * with printk_cpu_sync_get_irqsave() that we can still at least + * get the message about the lockup out. + */ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + printk_cpu_sync_get_irqsave(flags); + print_modules(); print_irqtrace_events(current); if (cpu == this_cpu) { @@ -173,7 +184,9 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) show_regs(regs); else dump_stack(); + printk_cpu_sync_put_irqrestore(flags); } else { + printk_cpu_sync_put_irqrestore(flags); trigger_single_cpu_backtrace(cpu); } -- cgit v1.2.3 From 55efe4abf927aca3692870a1851067f309e9a374 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:37 -0800 Subject: watchdog: if panicking and we dumped everything, don't re-enable dumping If, as part of handling a hardlockup or softlockup, we've already dumped all CPUs and we're just about to panic, don't reenable dumping and give some other CPU a chance to hop in there and add some confusing logs right as the panic is happening. 
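Sketched against the hardlockup path (the softlockup path gets the same treatment; simplified from the diff below):

        if (sysctl_hardlockup_all_cpu_backtrace) {
                trigger_allbutcpu_cpu_backtrace(cpu);
                /* keep the "dump in progress" bit held if we are about to panic */
                if (!hardlockup_panic)
                        clear_bit_unlock(0, &hard_lockup_nmi_warn);
        }

        if (hardlockup_panic)
                nmi_panic(regs, "Hard LOCKUP");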
Link: https://lkml.kernel.org/r/20231220131534.4.Id3a9c7ec2d7d83e4080da6f8662ba2226b40543f@changeid Signed-off-by: Douglas Anderson Cc: John Ogness Cc: Lecopzer Chen Cc: Li Zhe Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 11f9577accca..81a8862295d6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -192,7 +192,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); - clear_bit_unlock(0, &hard_lockup_nmi_warn); + if (!hardlockup_panic) + clear_bit_unlock(0, &hard_lockup_nmi_warn); } if (hardlockup_panic) @@ -548,7 +549,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); - clear_bit_unlock(0, &soft_lockup_nmi_warn); + if (!softlockup_panic) + clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); -- cgit v1.2.3 From 6dff315972640bfe542e2d044933751afd8e6c4a Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 2 Jan 2024 22:49:05 +0800 Subject: crash_core: fix and simplify the logic of crash_exclude_mem_range() The purpose of crash_exclude_mem_range() is to remove all memory ranges that overlap with [mstart-mend]. However, the current logic only removes the first overlapping memory range. Commit a2e9a95d2190 ("kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges") attempted to address this issue, but it did not fix all error cases. Let's fix and simplify the logic of crash_exclude_mem_range(). Link: https://lkml.kernel.org/r/20240102144905.110047-4-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Dave Young Cc: Hari Bathini Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Sourabh Jain Cc: Takashi Iwai Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 80 +++++++++++++++++++---------------------------------- 1 file changed, 29 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 6f074e112c1e..62e0227d390e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -566,9 +566,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { - int i, j; + int i; unsigned long long start, end, p_start, p_end; - struct range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; @@ -576,72 +575,51 @@ int crash_exclude_mem_range(struct crash_mem *mem, p_start = mstart; p_end = mend; - if (mstart > end || mend < start) + if (p_start > end) continue; + /* + * Because the memory ranges in mem->ranges are stored in + * ascending order, when we detect `p_end < start`, we can + * immediately exit the for loop, as the subsequent memory + * ranges will definitely be outside the range we are looking + * for. 
+ */ + if (p_end < start) + break; + /* Truncate any area outside of range */ - if (mstart < start) + if (p_start < start) p_start = start; - if (mend > end) + if (p_end > end) p_end = end; /* Found completely overlapping range */ if (p_start == start && p_end == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - - /* - * Continue to check if there are another overlapping ranges - * from the current position because of shifting the above - * mem ranges. - */ - i--; - mem->nr_ranges--; - continue; - } + memmove(&mem->ranges[i], &mem->ranges[i + 1], + (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i])); + i--; mem->nr_ranges--; - return 0; - } - - if (p_start > start && p_end < end) { + } else if (p_start > start && p_end < end) { /* Split original range */ + if (mem->nr_ranges >= mem->max_nr_ranges) + return -ENOMEM; + + memmove(&mem->ranges[i + 2], &mem->ranges[i + 1], + (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i])); + mem->ranges[i].end = p_start - 1; - temp_range.start = p_end + 1; - temp_range.end = end; + mem->ranges[i + 1].start = p_end + 1; + mem->ranges[i + 1].end = end; + + i++; + mem->nr_ranges++; } else if (p_start != start) mem->ranges[i].end = p_start - 1; else mem->ranges[i].start = p_end + 1; - break; - } - - /* If a split happened, add the split to array */ - if (!temp_range.end) - return 0; - - /* Split happened */ - if (i == mem->max_nr_ranges - 1) - return -ENOMEM; - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; } - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; return 0; } -- cgit v1.2.3
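For reference, the simplified exclusion logic can be exercised outside the kernel with a small sketch like the one below (the struct, function name, and test values are stand-ins, and -1 substitutes for -ENOMEM; this is an illustration, not the kernel code):

#include <stdio.h>
#include <string.h>

struct range { unsigned long long start, end; };

/* Remove [mstart, mend] from a sorted, non-overlapping array of ranges. */
static int exclude_range(struct range *r, int *nr, int max,
                         unsigned long long mstart, unsigned long long mend)
{
        for (int i = 0; i < *nr; i++) {
                unsigned long long start = r[i].start, end = r[i].end;
                unsigned long long p_start = mstart, p_end = mend;

                if (p_start > end)
                        continue;
                if (p_end < start)      /* sorted: nothing later can overlap */
                        break;

                /* Truncate any area outside of the current range */
                if (p_start < start)
                        p_start = start;
                if (p_end > end)
                        p_end = end;

                if (p_start == start && p_end == end) {
                        /* fully covered: drop the range */
                        memmove(&r[i], &r[i + 1], (*nr - (i + 1)) * sizeof(*r));
                        i--;
                        (*nr)--;
                } else if (p_start > start && p_end < end) {
                        /* strictly inside: split into two ranges */
                        if (*nr >= max)
                                return -1;
                        memmove(&r[i + 2], &r[i + 1], (*nr - (i + 1)) * sizeof(*r));
                        r[i].end = p_start - 1;
                        r[i + 1].start = p_end + 1;
                        r[i + 1].end = end;
                        i++;
                        (*nr)++;
                } else if (p_start != start) {
                        r[i].end = p_start - 1;         /* trim the tail */
                } else {
                        r[i].start = p_end + 1;         /* trim the head */
                }
        }
        return 0;
}

int main(void)
{
        struct range r[4] = { { 0x1000, 0x8fff }, { 0xa000, 0xbfff } };
        int nr = 2;

        exclude_range(r, &nr, 4, 0x3000, 0x3fff);       /* splits the first range */
        for (int i = 0; i < nr; i++)
                printf("%#llx-%#llx\n", r[i].start, r[i].end);
        /* expected: 0x1000-0x2fff, 0x4000-0x8fff, 0xa000-0xbfff */
        return 0;
}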