From aa807b9f22df2eee28593cbbabba0f93f4aa26c1 Mon Sep 17 00:00:00 2001
From: Feng Tang
Date: Thu, 12 Jun 2025 10:14:17 +0800
Subject: dma-contiguous: honor the cma address limit set up by the user

When porting a cma-related usage from an x86_64 server to an arm64
server, the "cma=4G@4G" setup failed on arm64. The reason is that arm64
and some other architectures impose a specific physical address limit
on the reserved cma area, such as 4GB, to accommodate devices that need
32-bit dma.

Many platforms of those architectures have no device with such a dma
limit, yet they still have to obey it and therefore cannot reserve a
huge cma pool.

Improve this situation by honoring the user-supplied cma physical
address limit over the architecture limit: when users specify a limit,
they already know the default, which evidently does not suit them.

Suggested-by: Robin Murphy
Signed-off-by: Feng Tang
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20250612021417.44929-1-feng.tang@linux.alibaba.com
---
 kernel/dma/contiguous.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 8df0dfaaca18..67af8a55185d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -222,7 +222,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
 	if (size_cmdline != -1) {
 		selected_size = size_cmdline;
 		selected_base = base_cmdline;
-		selected_limit = min_not_zero(limit_cmdline, limit);
+
+		/* Honor the dma address limit set up by the user */
+		selected_limit = limit_cmdline ?: limit;
+
 		if (base_cmdline + size_cmdline == limit_cmdline)
 			fixed = true;
 	} else {
-- 
cgit v1.2.3
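[Editor's illustration] A minimal user-space sketch (GNU C; min_not_zero()
is re-created here and everything else is a stand-in, not kernel code) of
how the old and new expressions treat the failing cma=4G@4G case, whose
implied limit of 8G exceeds a 4G architecture limit:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t phys_addr_t;

/* Re-creation of the kernel's min_not_zero(): smaller non-zero value. */
#define min_not_zero(x, y) ({					\
	phys_addr_t _x = (x), _y = (y);				\
	_x == 0 ? _y : (_y == 0 ? _x : (_x < _y ? _x : _y)); })

int main(void)
{
	phys_addr_t arch_limit = 1ULL << 32;	/* 4G, e.g. a 32-bit dma zone */
	phys_addr_t limit_cmdline = 2ULL << 32;	/* 8G, implied by cma=4G@4G */

	/* Old behavior: the arch limit clamps the user's request to 4G. */
	printf("min_not_zero: %#llx\n",
	       (unsigned long long)min_not_zero(limit_cmdline, arch_limit));

	/* New behavior: a non-zero user limit is honored as-is (8G). */
	printf("?: fallback:  %#llx\n",
	       (unsigned long long)(limit_cmdline ?: arch_limit));
	return 0;
}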
From ca3881f6fd8e9b6eb2d51e8718d07d3b8029d886 Mon Sep 17 00:00:00 2001
From: Petr Pavlu
Date: Wed, 18 Jun 2025 14:26:26 +0200
Subject: module: Fix memory deallocation on error path in move_module()

The function move_module() uses the variable t to track how many memory
types it has allocated and, consequently, how many should be freed if
an error occurs.

The variable is initially set to 0 and is updated when a call to
module_memory_alloc() fails. However, move_module() can fail for other
reasons as well, in which case t remains set to 0 and no memory is
freed.

Fix the problem by initializing t to MOD_MEM_NUM_TYPES. Additionally,
make the deallocation loop more robust by not relying on the
mod_mem_type_t enum having a signed integer as its underlying type.

Fixes: c7ee8aebf6c0 ("module: add stop-grap sanity check on module memcpy()")
Signed-off-by: Petr Pavlu
Reviewed-by: Sami Tolvanen
Reviewed-by: Daniel Gomez
Link: https://lore.kernel.org/r/20250618122730.51324-2-petr.pavlu@suse.com
Signed-off-by: Daniel Gomez
Message-ID: <20250618122730.51324-2-petr.pavlu@suse.com>
---
 kernel/module/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..9ac994b2f354 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2697,7 +2697,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 static int move_module(struct module *mod, struct load_info *info)
 {
 	int i;
-	enum mod_mem_type t = 0;
+	enum mod_mem_type t = MOD_MEM_NUM_TYPES;
 	int ret = -ENOMEM;
 	bool codetag_section_found = false;
 
@@ -2776,7 +2776,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	return 0;
 out_err:
 	module_memory_restore_rox(mod);
-	for (t--; t >= 0; t--)
+	while (t--)
 		module_memory_free(mod, t);
 	if (codetag_section_found)
 		codetag_free_module_sections(mod);
-- 
cgit v1.2.3
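[Editor's illustration] A standalone sketch (plain C; N, alloc() and
release() are stand-ins for MOD_MEM_NUM_TYPES, module_memory_alloc() and
module_memory_free(), not the kernel code) of why the fixed unwind works
for every failure point:

#include <stdbool.h>
#include <stdio.h>

#define N 4	/* stand-in for MOD_MEM_NUM_TYPES */

static bool alloc(unsigned int t)   { printf("alloc %u\n", t); return t < 2; }
static void release(unsigned int t) { printf("free  %u\n", t); }

int main(void)
{
	/* Start as if every type were allocated, as the fix does ... */
	unsigned int t = N;

	for (unsigned int i = 0; i < N; i++) {
		if (!alloc(i)) {
			t = i;	/* ... and narrow to the types that succeeded */
			break;
		}
	}

	/*
	 * "while (t--)" frees types t-1 .. 0 and stays correct even when
	 * t has an unsigned underlying type; the old "for (t--; t >= 0;
	 * t--)" never terminates for unsigned t, and with the old "t = 0"
	 * initialization a failure elsewhere would free nothing at all.
	 */
	while (t--)
		release(t);
	return 0;
}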
From eb0994a954978f0edd3efb38d0cbe6744df8b83d Mon Sep 17 00:00:00 2001
From: Petr Pavlu
Date: Wed, 18 Jun 2025 14:26:27 +0200
Subject: module: Avoid unnecessary return value initialization in move_module()

All error conditions in move_module() set the return value by updating
the ret variable. Therefore, it is not necessary to initialize the
variable when declaring it.

Remove the unnecessary initialization.

Signed-off-by: Petr Pavlu
Reviewed-by: Sami Tolvanen
Reviewed-by: Daniel Gomez
Link: https://lore.kernel.org/r/20250618122730.51324-3-petr.pavlu@suse.com
Signed-off-by: Daniel Gomez
Message-ID: <20250618122730.51324-3-petr.pavlu@suse.com>
---
 kernel/module/main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 9ac994b2f354..7822b91fca6b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2696,9 +2696,8 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 
 static int move_module(struct module *mod, struct load_info *info)
 {
-	int i;
+	int i, ret;
 	enum mod_mem_type t = MOD_MEM_NUM_TYPES;
-	int ret = -ENOMEM;
 	bool codetag_section_found = false;
 
 	for_each_mod_mem_type(type) {
-- 
cgit v1.2.3
From 570db4b39f535a8bb722adb8be0280d09e34ca99 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 10 Jun 2025 18:33:28 +0200
Subject: module: Make sure relocations are applied to the per-CPU section

The per-CPU data section is handled differently than the other
sections. The memory allocation requires a special __percpu pointer,
and the section is then copied into the view of each CPU. Therefore the
SHF_ALLOC flag is removed to ensure move_module() skips it.

Later, relocations are applied, and apply_relocations() skips sections
without SHF_ALLOC because they have not been copied. This also skips
the per-CPU data section.

The missing relocations result in a NULL pointer on x86-64 and very
small values on x86-32. This results in a crash, because such a small
value is not skipped the way a NULL pointer would be, yet it cannot be
dereferenced either. Such an assignment happens during static per-CPU
lock initialisation with lockdep enabled.

Allow relocation processing for the per-CPU section even if SHF_ALLOC
is missing.

Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202506041623.e45e4f7d-lkp@intel.com
Fixes: 1a6100caae425 ("Don't relocate non-allocated regions in modules.") #v2.6.1-rc3
Signed-off-by: Sebastian Andrzej Siewior
Reviewed-by: Petr Pavlu
Link: https://lore.kernel.org/r/20250610163328.URcsSUC1@linutronix.de
Signed-off-by: Daniel Gomez
Message-ID: <20250610163328.URcsSUC1@linutronix.de>
---
 kernel/module/main.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7822b91fca6b..c2c08007029d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1573,8 +1573,14 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
 		if (infosec >= info->hdr->e_shnum)
 			continue;
 
-		/* Don't bother with non-allocated sections */
-		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+		/*
+		 * Don't bother with non-allocated sections.
+		 * An exception is the percpu section, which has separate allocations
+		 * for individual CPUs. We relocate the percpu section in the initial
+		 * ELF template and subsequently copy it to the per-CPU destinations.
+		 */
+		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) &&
+		    (!infosec || infosec != info->index.pcpu))
 			continue;
 
 		if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
-- 
cgit v1.2.3
From 3da6bb419750f3ad834786d6ba7c9d5d062c770b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Wed, 9 Jul 2025 20:27:52 +0900
Subject: perf/core: Fix WARN in perf_sigtrap()

Since exit_task_work() runs after perf_event_exit_task_context() has
updated ctx->task to TASK_TOMBSTONE, perf_sigtrap() called from
perf_pending_task() might observe event->ctx->task == TASK_TOMBSTONE.

Swap the early exit tests in order not to hit the WARN_ON_ONCE().

Closes: https://syzkaller.appspot.com/bug?extid=2fe61cb2a86066be6985
Reported-by: syzbot
Signed-off-by: Tetsuo Handa
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/b1c224bd-97f9-462c-a3e3-125d5e19c983@I-love.SAKURA.ne.jp
---
 kernel/events/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0db36b2b2448..22fdf0c187cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7204,18 +7204,18 @@ void perf_event_wakeup(struct perf_event *event)
 static void perf_sigtrap(struct perf_event *event)
 {
 	/*
-	 * We'd expect this to only occur if the irq_work is delayed and either
-	 * ctx->task or current has changed in the meantime. This can be the
-	 * case on architectures that do not implement arch_irq_work_raise().
+	 * Both perf_pending_task() and perf_pending_irq() can race with the
+	 * task exiting.
 	 */
-	if (WARN_ON_ONCE(event->ctx->task != current))
+	if (current->flags & PF_EXITING)
 		return;
 
 	/*
-	 * Both perf_pending_task() and perf_pending_irq() can race with the
-	 * task exiting.
+	 * We'd expect this to only occur if the irq_work is delayed and either
+	 * ctx->task or current has changed in the meantime. This can be the
+	 * case on architectures that do not implement arch_irq_work_raise().
 	 */
-	if (current->flags & PF_EXITING)
+	if (WARN_ON_ONCE(event->ctx->task != current))
 		return;
 
 	send_sig_perf((void __user *)event->pending_addr,
-- 
cgit v1.2.3
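[Editor's illustration] A standalone sketch (stand-in types and names,
not the kernel implementation) of why the order of the two early-exit
tests matters:

#include <stdio.h>

#define PF_EXITING	0x00000004
#define TASK_TOMBSTONE	((struct task *)-1L)

struct task { unsigned int flags; };

static void sigtrap(struct task *current_task, struct task *ctx_task)
{
	/*
	 * Checked first: an exiting task may legitimately see ctx->task
	 * already replaced by TASK_TOMBSTONE, so bail out quietly.
	 */
	if (current_task->flags & PF_EXITING)
		return;

	/* Only now is a mismatch genuinely unexpected and worth a WARN. */
	if (ctx_task != current_task) {
		fprintf(stderr, "WARN: ctx->task != current\n");
		return;
	}

	/* ... deliver the signal ... */
}

int main(void)
{
	struct task exiting = { .flags = PF_EXITING };

	/* With the old order this combination warned; now it returns. */
	sigtrap(&exiting, TASK_TOMBSTONE);
	return 0;
}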
This commit introduces per-memcg/task NUMA balance statistics, but unfortunately it introduced a NULL pointer exception due to the following race condition: After a swap task candidate was chosen, its mm_struct pointer was set to NULL due to task exit. Later, when performing the actual task swapping, the p->mm caused the problem. CPU0 CPU1 : ... task_numa_migrate task_numa_find_cpu task_numa_compare # a normal task p is chosen env->best_task = p # p exit: exit_signals(p); p->flags |= PF_EXITING exit_mm p->mm = NULL; migrate_swap_stop __migrate_swap_task((arg->src_task, arg->dst_cpu) count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)# p->mm is NULL task_lock() should be held and the PF_EXITING flag needs to be checked to prevent this from happening. After discussion, the conclusion was that adding a lock is not worthwhile for some statistics calculations. Revert the change and rely on the tracepoint for this purpose. Link: https://lkml.kernel.org/r/20250704135620.685752-1-yu.c.chen@intel.com Link: https://lkml.kernel.org/r/20250708064917.BBD13C4CEED@smtp.kernel.org Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task") Signed-off-by: Chen Yu Reported-by: Jirka Hladky Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/ Reported-by: Srikanth Aithal Reported-by: Suneeth D Acked-by: Michal Hocko Cc: Borislav Petkov Cc: Ingo Molnar Cc: Jiri Hladky Cc: Libo Chen Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 6 ------ include/linux/sched.h | 4 ---- include/linux/vm_event_item.h | 2 -- kernel/sched/core.c | 9 ++------- kernel/sched/debug.c | 4 ---- mm/memcontrol.c | 2 -- mm/vmstat.c | 2 -- 7 files changed, 2 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0cc35a14afbe..bd98ea3175ec 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1732,12 +1732,6 @@ The following nested keys are defined. numa_hint_faults (npn) Number of NUMA hinting faults. - numa_task_migrated (npn) - Number of task migration by NUMA balancing. - - numa_task_swapped (npn) - Number of task swap by NUMA balancing. - pgdemote_kswapd Number of pages demoted by kswapd. 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..aa9c5be7a632 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -548,10 +548,6 @@ struct sched_statistics {
 	u64				nr_failed_migrations_running;
 	u64				nr_failed_migrations_hot;
 	u64				nr_forced_migrations;
-#ifdef CONFIG_NUMA_BALANCING
-	u64				numa_task_migrated;
-	u64				numa_task_swapped;
-#endif
 
 	u64				nr_wakeups;
 	u64				nr_wakeups_sync;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 91a3ce9a2687..9e15a088ba38 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -66,8 +66,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NUMA_HINT_FAULTS,
 		NUMA_HINT_FAULTS_LOCAL,
 		NUMA_PAGE_MIGRATE,
-		NUMA_TASK_MIGRATE,
-		NUMA_TASK_SWAP,
 #endif
 #ifdef CONFIG_MIGRATION
 		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec68fc686bd7..81c6df746df1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #ifdef CONFIG_NUMA_BALANCING
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
-	__schedstat_inc(p->stats.numa_task_swapped);
-	count_vm_numa_event(NUMA_TASK_SWAP);
-	count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
-
 	if (task_on_rq_queued(p)) {
 		struct rq *src_rq, *dst_rq;
 		struct rq_flags srf, drf;
@@ -7939,9 +7935,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
 		return -EINVAL;
 
-	__schedstat_inc(p->stats.numa_task_migrated);
-	count_vm_numa_event(NUMA_TASK_MIGRATE);
-	count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
+	/* TODO: This is not properly updating schedstats */
+
 	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d71baf08075..557246880a7e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P_SCHEDSTAT(nr_failed_migrations_running);
 		P_SCHEDSTAT(nr_failed_migrations_hot);
 		P_SCHEDSTAT(nr_forced_migrations);
-#ifdef CONFIG_NUMA_BALANCING
-		P_SCHEDSTAT(numa_task_migrated);
-		P_SCHEDSTAT(numa_task_swapped);
-#endif
 		P_SCHEDSTAT(nr_wakeups);
 		P_SCHEDSTAT(nr_wakeups_sync);
 		P_SCHEDSTAT(nr_wakeups_migrate);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 902da8a9c643..70fdeda1120b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -474,8 +474,6 @@ static const unsigned int memcg_vm_event_stat[] = {
 	NUMA_PAGE_MIGRATE,
 	NUMA_PTE_UPDATES,
 	NUMA_HINT_FAULTS,
-	NUMA_TASK_MIGRATE,
-	NUMA_TASK_SWAP,
 #endif
 };
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 429ae5339bfe..a78d70ddeacd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1346,8 +1346,6 @@ const char * const vmstat_text[] = {
 	"numa_hint_faults",
 	"numa_hint_faults_local",
 	"numa_pages_migrated",
-	"numa_task_migrated",
-	"numa_task_swapped",
 #endif
 #ifdef CONFIG_MIGRATION
 	"pgmigrate_success",
-- 
cgit v1.2.3
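[Editor's illustration] For context, a standalone sketch (pthread
stand-ins, not the kernel API) of the task_lock()-plus-PF_EXITING guard
the commit message says would be needed to make such a p->mm access safe,
and which was judged too heavy for mere statistics:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define PF_EXITING 0x00000004

struct mm { int events; };

struct task {
	pthread_mutex_t lock;	/* stand-in for task_lock() */
	unsigned int flags;
	struct mm *mm;		/* set to NULL in exit_mm() */
};

/* Safe counterpart of the racy count_memcg_event_mm(p->mm, ...) call. */
static bool count_swap_event(struct task *p)
{
	bool counted = false;

	pthread_mutex_lock(&p->lock);
	if (!(p->flags & PF_EXITING) && p->mm) {
		p->mm->events++;	/* p->mm cannot go NULL under the lock */
		counted = true;
	}
	pthread_mutex_unlock(&p->lock);
	return counted;
}

int main(void)
{
	struct task p = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.flags = PF_EXITING,
		.mm = NULL,
	};

	/* Exiting task: the guard refuses instead of crashing on NULL. */
	return count_swap_event(&p) ? 1 : 0;
}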