From aa8d89d1472b010cbcda7288a4da76e4852a260d Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Wed, 2 Apr 2025 22:33:26 -0700
Subject: memcg: vmalloc: simplify MEMCG_VMALLOC updates

The vmalloc region can either be charged to a single memcg or none. At
the moment, the kernel traverses all the pages backing the vmalloc
region to update the MEMCG_VMALLOC stat. However, there is no need to
look at all the pages, as they will all be charged to a single memcg or
none. Simplify the MEMCG_VMALLOC update by just looking at the first
page of the vmalloc region.

[shakeel.butt@linux.dev: add comment]
Link: https://lkml.kernel.org/r/bmlkdbqgwboyqrnxyom7n52fjmo76ux77jhqw5odc6c6dfon3h@zdylwtmlywbt
Link: https://lkml.kernel.org/r/20250403053326.26860-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt
Acked-by: Michal Hocko
Reviewed-by: Uladzislau Rezki (Sony)
Reviewed-by: Yosry Ahmed
Cc: Johannes Weiner
Cc: Muchun Song
Cc: Roman Gushchin
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2d7511654831..44b735a4573a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3371,12 +3371,13 @@ void vfree(const void *addr)
 	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
 		vm_reset_perms(vm);
+	/* All pages of vm should be charged to same memcg, so use first one. */
+	if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+		mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
 	for (i = 0; i < vm->nr_pages; i++) {
 		struct page *page = vm->pages[i];
 
 		BUG_ON(!page);
-		if (!(vm->flags & VM_MAP_PUT_PAGES))
-			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
 		/*
 		 * High-order allocs for huge vmallocs are split, so
 		 * can be freed as an array of order-0 allocations
@@ -3672,12 +3673,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			node, page_order, nr_small_pages, area->pages);
 
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
-	if (gfp_mask & __GFP_ACCOUNT) {
-		int i;
-
-		for (i = 0; i < area->nr_pages; i++)
-			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
-	}
+	/* All pages of vm should be charged to same memcg, so use first one. */
+	if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+		mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+				     area->nr_pages);
 
 	/*
 	 * If not enough pages were obtained to accomplish an
--
cgit v1.2.3
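[editor's note: a sketch of the invariant the patch relies on, in kernel-style C; this is an illustration, not code from the patch, and charge_vmalloc_region() is a hypothetical helper. All pages backing one vmalloc region are populated in a single accounting context, so a single batched stat update on the first page is equivalent to nr_pages single-page updates:]

/* All backing pages belong to the same memcg (or none), so one
 * batched MEMCG_VMALLOC update replaces the old per-page loop. */
static void charge_vmalloc_region(struct vm_struct *vm, int sign)
{
	mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC,
			     sign * (int)vm->nr_pages);
}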
From 4318255091eab6722bea3816bdee09ac7db68035 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Tue, 8 Apr 2025 17:15:47 +0200
Subject: vmalloc: add for_each_vmap_node() helper

To simplify iteration over vmap-nodes, add the for_each_vmap_node()
macro that iterates over all nodes in a system. It tends to simplify
the code.

Link: https://lkml.kernel.org/r/20250408151549.77937-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony)
Cc: Christop Hellwig
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 44b735a4573a..55949360a5cb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -900,6 +900,11 @@ static struct vmap_node *vmap_nodes = &single;
 static __read_mostly unsigned int nr_vmap_nodes = 1;
 static __read_mostly unsigned int vmap_zone_size = 1;
 
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn)	\
+	for ((vn) = &vmap_nodes[0]; \
+		(vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
 static inline unsigned int
 addr_to_node_id(unsigned long addr)
 {
--
cgit v1.2.3
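[editor's note: a usage sketch of the new helper, not from the patch. The macro replaces the open-coded index loop that appears throughout the file:]

	/* Before: index-based iteration over the node array. */
	struct vmap_node *vn;
	int i;

	for (i = 0; i < nr_vmap_nodes; i++) {
		vn = &vmap_nodes[i];
		/* ... use vn ... */
	}

	/* After: the helper hides the array bookkeeping. */
	for_each_vmap_node(vn) {
		/* ... use vn ... */
	}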
From ce906d7679e1f1feab54bdddac4d39df846d663a Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Tue, 8 Apr 2025 17:15:48 +0200
Subject: vmalloc: switch to for_each_vmap_node() helper

There are places which can be updated easily to use the helper to
iterate over all vmap-nodes. This is what this patch does. The aim is
to improve readability and simplify the code.

[akpm@linux-foundation.org: fix build warning]
Link: https://lkml.kernel.org/r/20250408151549.77937-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony)
Cc: Christop Hellwig
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 40 +++++++++++++++-------------------------
 1 file changed, 15 insertions(+), 25 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 55949360a5cb..e80783fa4d2c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1061,12 +1061,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
 {
 	unsigned long va_start_lowest;
 	struct vmap_node *vn;
-	int i;
 
 repeat:
-	for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
-		vn = &vmap_nodes[i];
-
+	va_start_lowest = 0;
+	for_each_vmap_node(vn) {
 		spin_lock(&vn->busy.lock);
 		*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
 
@@ -4963,11 +4962,8 @@ static void show_purge_info(struct seq_file *m)
 {
 	struct vmap_node *vn;
 	struct vmap_area *va;
-	int i;
-
-	for (i = 0; i < nr_vmap_nodes; i++) {
-		vn = &vmap_nodes[i];
 
+	for_each_vmap_node(vn) {
 		spin_lock(&vn->lazy.lock);
 		list_for_each_entry(va, &vn->lazy.head, list) {
 			seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
@@ -4983,11 +4979,8 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *v;
-	int i;
-
-	for (i = 0; i < nr_vmap_nodes; i++) {
-		vn = &vmap_nodes[i];
 
+	for_each_vmap_node(vn) {
 		spin_lock(&vn->busy.lock);
 		list_for_each_entry(va, &vn->busy.head, list) {
 			if (!va->vm) {
@@ -5108,7 +5101,7 @@ static void __init vmap_init_free_space(void)
 static void vmap_init_nodes(void)
 {
 	struct vmap_node *vn;
-	int i, n;
+	int i;
 
 #if BITS_PER_LONG == 64
 	/*
@@ -5125,7 +5118,7 @@ static void vmap_init_nodes(void)
 	 * set of cores. Therefore a per-domain purging is supposed to
 	 * be added as well as a per-domain balancing.
 	 */
-	n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+	int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
 
 	if (n > 1) {
 		vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
@@ -5140,8 +5133,7 @@ static void vmap_init_nodes(void)
 	}
 #endif
 
-	for (n = 0; n < nr_vmap_nodes; n++) {
-		vn = &vmap_nodes[n];
+	for_each_vmap_node(vn) {
 		vn->busy.root = RB_ROOT;
 		INIT_LIST_HEAD(&vn->busy.head);
 		spin_lock_init(&vn->busy.lock);
@@ -5162,15 +5154,13 @@ static void vmap_init_nodes(void)
 static unsigned long
 vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	unsigned long count;
+	unsigned long count = 0;
 	struct vmap_node *vn;
-	int i, j;
-
-	for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
-		vn = &vmap_nodes[i];
+	int i;
 
-		for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
-			count += READ_ONCE(vn->pool[j].len);
+	for_each_vmap_node(vn) {
+		for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+			count += READ_ONCE(vn->pool[i].len);
 	}
 
 	return count ? count : SHRINK_EMPTY;
@@ -5179,10 +5169,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 static unsigned long
 vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	int i;
+	struct vmap_node *vn;
 
-	for (i = 0; i < nr_vmap_nodes; i++)
-		decay_va_pool_node(&vmap_nodes[i], true);
+	for_each_vmap_node(vn)
+		decay_va_pool_node(vn, true);
 
 	return SHRINK_STOP;
 }
--
cgit v1.2.3

From 24c76f37ab3fb2f3a202eaa678334b216f8bd161 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Tue, 8 Apr 2025 17:15:49 +0200
Subject: vmalloc: use for_each_vmap_node() in purge-vmap-area

Update __purge_vmap_area_lazy() to use the introduced helper; this is
the last such place in the vmalloc code. This patch also introduces an
extra function, node_to_id(), which converts a vmap_node pointer to an
index in the node array. __purge_vmap_area_lazy() requires that extra
function.

Link: https://lkml.kernel.org/r/20250408151549.77937-3-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony)
Cc: Christop Hellwig
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e80783fa4d2c..9eb3880887b6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -923,6 +923,19 @@ id_to_node(unsigned int id)
 	return &vmap_nodes[id % nr_vmap_nodes];
 }
 
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+	/* Pointer arithmetic. */
+	unsigned int id = node - vmap_nodes;
+
+	if (likely(id < nr_vmap_nodes))
+		return id;
+
+	WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
+	return 0;
+}
+
 /*
  * We use the value 0 to represent "no node", that is why
  * an encoded value will be the node-id incremented by 1.
@@ -2259,9 +2272,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
 	 */
 	purge_nodes = CPU_MASK_NONE;
 
-	for (i = 0; i < nr_vmap_nodes; i++) {
-		vn = &vmap_nodes[i];
-
+	for_each_vmap_node(vn) {
 		INIT_LIST_HEAD(&vn->purge_list);
 		vn->skip_populate = full_pool_decay;
 		decay_va_pool_node(vn, full_pool_decay);
@@ -2280,7 +2291,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
 			end = max(end, list_last_entry(&vn->purge_list,
 				struct vmap_area, list)->va_end);
 
-		cpumask_set_cpu(i, &purge_nodes);
+		cpumask_set_cpu(node_to_id(vn), &purge_nodes);
 	}
 
 	nr_purge_nodes = cpumask_weight(&purge_nodes);
--
cgit v1.2.3

From 0272d07ef6ebc574e62d6423e5efa5b975f9f910 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Tue, 15 Apr 2025 13:26:46 +0200
Subject: vmalloc: use atomic_long_add_return_relaxed()

Switch from atomic_long_add_return() to its relaxed version. We do not
need a full memory barrier or any memory ordering while increasing the
"vmap_lazy_nr" variable; all we need is for the update to be atomic.
This is what atomic_long_add_return_relaxed() guarantees.
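[editor's note: an illustrative kernel-style sketch, not part of the patch; both calls below are the real atomic_long API, demo_nr/demo_add are hypothetical. The two variants differ only in the ordering they impose, which the AARCH64 disassembly that follows makes visible:]

static atomic_long_t demo_nr = ATOMIC_LONG_INIT(0);

static long demo_add(long pages)
{
	/* Fully ordered read-modify-write: ldaddal on arm64. */
	/* return atomic_long_add_return(pages, &demo_nr); */

	/* Atomicity only, no barriers: ldadd on arm64. */
	return atomic_long_add_return_relaxed(pages, &demo_nr);
}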
AARCH64:

Default:
    40ec:	d34cfe94	lsr	x20, x20, #12
    40f0:	14000044	b	4200
    40f4:	94000000	bl	0 <__sanitizer_cov_trace_pc>
    40f8:	90000000	adrp	x0, 0 <__traceiter_alloc_vmap_area>
    40fc:	91000000	add	x0, x0, #0x0
    4100:	f8f40016	ldaddal	x20, x22, [x0]
    4104:	8b160296	add	x22, x20, x22

Relaxed:
    40ec:	d34cfe94	lsr	x20, x20, #12
    40f0:	14000044	b	4200
    40f4:	94000000	bl	0 <__sanitizer_cov_trace_pc>
    40f8:	90000000	adrp	x0, 0 <__traceiter_alloc_vmap_area>
    40fc:	91000000	add	x0, x0, #0x0
    4100:	f8340016	ldadd	x20, x22, [x0]
    4104:	8b160296	add	x22, x20, x22

Link: https://lkml.kernel.org/r/20250415112646.113091-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony)
Reviewed-by: Baoquan He
Cc: Christop Hellwig
Cc: Mateusz Guzik
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9eb3880887b6..dc33ebeb8b1b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2370,7 +2370,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 	if (WARN_ON_ONCE(!list_empty(&va->list)))
 		return;
 
-	nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+	nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
 		&vmap_lazy_nr);
 
 	/*
--
cgit v1.2.3

From d0966120486833cd837feea9917b7ed06e74f58c Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 17 Apr 2025 18:12:16 +0200
Subject: vmalloc: align nr_vmalloc_pages and vmap_lazy_nr

Currently both atomics share one cache-line:

 ...
 ffffffff83eab400 b vmap_lazy_nr
 ffffffff83eab408 b nr_vmalloc_pages
 ...

Those are global variables and they are only 8 bytes apart. Since they
are modified by different threads, this causes false sharing, which can
lead to a performance drop due to unnecessary cache invalidations.

After this patch each is aligned to a cache-line boundary:

 ...
 ffffffff8260a600 d vmap_lazy_nr
 ffffffff8260a640 d nr_vmalloc_pages
 ...

Link: https://lkml.kernel.org/r/20250417161216.88318-4-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony)
Reviewed-by: Baoquan He
Reviewed-by: Adrian Huang
Tested-by: Adrian Huang
Cc: Mateusz Guzik
Cc: Christop Hellwig
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dc33ebeb8b1b..3fd802134e4e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1008,7 +1008,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 static void drain_vmap_area_work(struct work_struct *work);
 static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
 
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
 
 unsigned long vmalloc_nr_pages(void)
 {
@@ -2117,8 +2118,6 @@ static unsigned long lazy_max_pages(void)
 	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
 }
 
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
 /*
  * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
--
cgit v1.2.3
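[editor's note: a minimal sketch of the false-sharing pattern fixed above; hypothetical structure and field names, not kernel code. Two write-hot counters packed into one 64-byte cache line make every update on one CPU invalidate the line another CPU needs:]

struct counters_bad {
	atomic_long_t a;	/* written by CPU 0 */
	atomic_long_t b;	/* written by CPU 1; 8 bytes away, same line */
};

struct counters_good {
	atomic_long_t a ____cacheline_aligned_in_smp;
	atomic_long_t b ____cacheline_aligned_in_smp;
};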
From f7f68274e476c49e29bfb81daf4ad717fe9880c6 Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Sat, 19 Apr 2025 06:36:49 +0800
Subject: mm/vmalloc.c: change purge_ndoes as local static variable

Patch series "mm/vmalloc.c: code cleanup and improvements", v2.

These changes were made from code inspection in mm/vmalloc.c.

This patch (of 5):

Static variable 'purge_nodes' is defined in global scope, while it's
only used in function __purge_vmap_area_lazy(). It mainly serves to
avoid repeated memory allocation, especially when NR_CPUS is big. A
local static variable satisfies that demand just as well and improves
code readability. Hence move its definition into
__purge_vmap_area_lazy().

Link: https://lkml.kernel.org/r/20250418223653.243436-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20250418223653.243436-2-bhe@redhat.com
Signed-off-by: Baoquan He
Reviewed-by: Uladzislau Rezki (Sony)
Reviewed-by: Vishal Moola (Oracle)
Reviewed-by: Shivank Garg
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3fd802134e4e..cd733387b1a0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2127,7 +2127,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
 
 /* for per-CPU blocks */
 static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
 
 static void
 reclaim_list_global(struct list_head *head)
@@ -2260,6 +2259,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
 {
 	unsigned long nr_purged_areas = 0;
 	unsigned int nr_purge_helpers;
+	static cpumask_t purge_nodes;
 	unsigned int nr_purge_nodes;
 	struct vmap_node *vn;
 	int i;
--
cgit v1.2.3

From 81262d85aef4dbdd878d6ce982fe51c06ed2720c Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Sat, 19 Apr 2025 06:36:50 +0800
Subject: mm/vmalloc.c: find the vmap of vmap_nodes in reverse order

When finding a VA in vn->busy, if the VA spans several zones and the
passed addr is not the same as va->va_start, we should scan the vmap
nodes in reverse order, because the starting address of the VA must be
smaller than the passed addr if it really resides in the VA.

E.g. on a system with nr_vmap_nodes=100:

             <----va---->
 -|-----|-----|-----|-----|-----|-----|-----|-----|-----|-
     ...   n-1    n    n+1   n+2   ...   100     0     1

The VA resides in node 'n' while spanning 'n', 'n+1' and 'n+2'. If the
passed addr is within 'n+2', we should try the nodes backwards, 'n+1'
and then 'n', and succeed very soon. Meanwhile we still need to loop
around, because a VA could span from node 'n' through node 100, node 0
and node 1. Changing the search to reverse order can improve efficiency
on systems with many CPUs.
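[editor's note: a condensed sketch of the lookup after this change, based on the find_vmap_area() hunk below; locking and error paths simplified, lookup_backwards() is a hypothetical name. Adding nr_vmap_nodes - 1 before the modulo steps one node back per iteration without relying on signed wrap-around:]

static struct vmap_area *lookup_backwards(unsigned long addr)
{
	unsigned int i, j;

	i = j = addr_to_node_id(addr);
	do {
		struct vmap_node *vn = &vmap_nodes[i];
		struct vmap_area *va;

		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;
		/* Step one node back, wrapping past zero. */
	} while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);

	return NULL;
}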
Link: https://lkml.kernel.org/r/20250418223653.243436-3-bhe@redhat.com
Signed-off-by: Baoquan He
Reviewed-by: Uladzislau Rezki (Sony)
Reviewed-by: Shivank Garg
Cc: Vishal Moola (Oracle)
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cd733387b1a0..26d86ab5e59e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2435,7 +2435,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 
 		if (va)
 			return va;
-	} while ((i = (i + 1) % nr_vmap_nodes) != j);
+	} while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
 
 	return NULL;
 }
@@ -2461,7 +2461,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
 
 		if (va)
 			return va;
-	} while ((i = (i + 1) % nr_vmap_nodes) != j);
+	} while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
 
 	return NULL;
 }
--
cgit v1.2.3

From 4f05024eba021f1885c044c8515b6a2674b652de Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Sat, 19 Apr 2025 06:36:51 +0800
Subject: mm/vmalloc.c: optimize code in decay_va_pool_node() a little bit

When purging lazily freed vmap areas, the VAs stored in vn->pool[] are
also moved, partially or completely, into the free vmap tree; that is
done in decay_va_pool_node(). When doing so, for each pool of a node,
the whole list is detached from the pool for handling, at which point
the pool is empty. It's not necessary to update the pool size each time
one VA is removed and added into the free vmap tree. Change the code to
update the pool size once, when attaching the pool back.

Link: https://lkml.kernel.org/r/20250418223653.243436-4-bhe@redhat.com
Signed-off-by: Baoquan He
Reviewed-by: Uladzislau Rezki (Sony)
Cc: Shivank Garg
Cc: Vishal Moola (Oracle)
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 26d86ab5e59e..e58c2ed4bcba 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2149,7 +2149,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 	LIST_HEAD(decay_list);
 	struct rb_root decay_root = RB_ROOT;
 	struct vmap_area *va, *nva;
-	unsigned long n_decay;
+	unsigned long n_decay, pool_len;
 	int i;
 
 	for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
@@ -2163,22 +2163,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 		list_replace_init(&vn->pool[i].head, &tmp_list);
 		spin_unlock(&vn->pool_lock);
 
-		if (full_decay)
-			WRITE_ONCE(vn->pool[i].len, 0);
+		pool_len = n_decay = vn->pool[i].len;
+		WRITE_ONCE(vn->pool[i].len, 0);
 
 		/* Decay a pool by ~25% out of left objects. */
-		n_decay = vn->pool[i].len >> 2;
+		if (!full_decay)
+			n_decay >>= 2;
+		pool_len -= n_decay;
 
 		list_for_each_entry_safe(va, nva, &tmp_list, list) {
+			if (!n_decay--)
+				break;
+
 			list_del_init(&va->list);
 			merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
-			if (!full_decay) {
-				WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
-				if (!--n_decay)
-					break;
-			}
 		}
 
 		/*
@@ -2187,9 +2185,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 		 * can populate the pool therefore a simple list replace
 		 * operation takes place here.
 		 */
-		if (!full_decay && !list_empty(&tmp_list)) {
+		if (!list_empty(&tmp_list)) {
 			spin_lock(&vn->pool_lock);
 			list_replace_init(&tmp_list, &vn->pool[i].head);
+			WRITE_ONCE(vn->pool[i].len, pool_len);
 			spin_unlock(&vn->pool_lock);
 		}
 	}
--
cgit v1.2.3
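[editor's note: the batching idea in isolation; generic list code with a hypothetical demo_pool type, not the kernel structures. Detach the whole list, compute how many entries survive, and write the length once at re-attach instead of decrementing a shared counter per evicted entry:]

struct demo_pool {
	struct list_head head;
	unsigned long len;
};

static void demo_decay(struct demo_pool *pool, bool full)
{
	LIST_HEAD(tmp);
	unsigned long evict, keep;
	struct list_head *pos, *n;

	evict = pool->len;
	if (!full)
		evict >>= 2;			/* decay ~25% */
	keep = pool->len - evict;

	list_replace_init(&pool->head, &tmp);
	pool->len = 0;

	list_for_each_safe(pos, n, &tmp) {
		if (!evict--)
			break;
		list_del_init(pos);
		/* ... merge or free the evicted entry ... */
	}

	if (!list_empty(&tmp)) {
		list_replace_init(&tmp, &pool->head);
		pool->len = keep;		/* one update, not per entry */
	}
}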
From 8ab8442d44ee37cc21fdc11530158ee2b2be5ed5 Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Sat, 19 Apr 2025 06:36:52 +0800
Subject: mm/vmalloc: optimize function vm_unmap_aliases()

Remove unneeded local variables and replace them with values.

Link: https://lkml.kernel.org/r/20250418223653.243436-5-bhe@redhat.com
Signed-off-by: Baoquan He
Reviewed-by: Uladzislau Rezki (Sony)
Reviewed-by: Vishal Moola (Oracle)
Reviewed-by: Shivank Garg
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e58c2ed4bcba..bfc2f2c45d89 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2929,10 +2929,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
  */
 void vm_unmap_aliases(void)
 {
-	unsigned long start = ULONG_MAX, end = 0;
-	int flush = 0;
-
-	_vm_unmap_aliases(start, end, flush);
+	_vm_unmap_aliases(ULONG_MAX, 0, 0);
 }
 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 
--
cgit v1.2.3
From b25f97d0f8043903a64e3a203b79219cc69f2f77 Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Sat, 19 Apr 2025 06:36:53 +0800
Subject: mm/vmalloc.c: return explicit error value in alloc_vmap_area()

In alloc_vmap_area(), the upper bound 'vend' is returned to indicate
whether the allocation succeeded or failed, which is not very clear.
Change the code to return explicit error values and check those to
judge whether the allocation succeeded.

Link: https://lkml.kernel.org/r/20250418223653.243436-6-bhe@redhat.com
Signed-off-by: Baoquan He
Reviewed-by: Shivank Garg
Reviewed-by: Uladzislau Rezki (Sony)
Cc: Vishal Moola (Oracle)
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bfc2f2c45d89..701ea4ec8950 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1716,7 +1716,7 @@ va_clip(struct rb_root *root, struct list_head *head,
 		 */
 		lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
 		if (!lva)
-			return -1;
+			return -ENOMEM;
 	}
 
 	/*
@@ -1730,7 +1730,7 @@ va_clip(struct rb_root *root, struct list_head *head,
 		 */
 		va->va_start = nva_start_addr + size;
 	} else {
-		return -1;
+		return -EINVAL;
 	}
 
 	if (type != FL_FIT_TYPE) {
@@ -1759,19 +1759,19 @@ va_alloc(struct vmap_area *va,
 
 	/* Check the "vend" restriction. */
 	if (nva_start_addr + size > vend)
-		return vend;
+		return -ERANGE;
 
 	/* Update the free vmap_area. */
 	ret = va_clip(root, head, va, nva_start_addr, size);
 	if (WARN_ON_ONCE(ret))
-		return vend;
+		return ret;
 
 	return nva_start_addr;
 }
 
 /*
  * Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
 */
 static __always_inline unsigned long
 __alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1796,14 +1796,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
 
 	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
-		return vend;
+		return -ENOENT;
 
 	nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
-	if (nva_start_addr == vend)
-		return vend;
 
 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
-	find_vmap_lowest_match_check(root, head, size, align);
+	if (!IS_ERR_VALUE(nva_start_addr))
+		find_vmap_lowest_match_check(root, head, size, align);
 #endif
 
 	return nva_start_addr;
@@ -1933,7 +1932,7 @@ node_alloc(unsigned long size, unsigned long align,
 	struct vmap_area *va;
 
 	*vn_id = 0;
-	*addr = vend;
+	*addr = -EINVAL;
 
 	/*
 	 * Fallback to a global heap if not vmalloc or there
@@ -2013,20 +2012,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	}
 
 retry:
-	if (addr == vend) {
+	if (IS_ERR_VALUE(addr)) {
 		preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
 		addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
 			size, align, vstart, vend);
 		spin_unlock(&free_vmap_area_lock);
 	}
 
-	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+	trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
 
 	/*
-	 * If an allocation fails, the "vend" address is
+	 * If an allocation fails, the error value is
 	 * returned. Therefore trigger the overflow path.
 	 */
-	if (unlikely(addr == vend))
+	if (IS_ERR_VALUE(addr))
 		goto overflow;
 
 	va->va_start = addr;
--
cgit v1.2.3
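[editor's note: a hypothetical caller, heavily simplified and ignoring the locking that the real allocation path requires; shown only to illustrate the errno-in-unsigned-long convention. IS_ERR_VALUE() treats the topmost 4095 values of the address space as error codes, so a valid address and a -Exxx value can share one return slot:]

static int demo_reserve(unsigned long size)
{
	unsigned long addr = __alloc_vmap_area(&free_vmap_area_root,
			&free_vmap_area_list, size, PAGE_SIZE,
			VMALLOC_START, VMALLOC_END);

	if (IS_ERR_VALUE(addr))
		return (int)addr;	/* propagate -ENOENT, -ERANGE, ... */

	/* ... use addr ... */
	return 0;
}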
From 5c5f0468d172ddec2e333d738d2a1f85402cf0bc Mon Sep 17 00:00:00 2001
From: Jeongjun Park
Date: Fri, 9 May 2025 01:56:20 +0900
Subject: mm/vmalloc: fix data race in show_numa_info()

The following data-race was found in show_numa_info():

==================================================================
BUG: KCSAN: data-race in vmalloc_info_show / vmalloc_info_show

read to 0xffff88800971fe30 of 4 bytes by task 8289 on cpu 0:
 show_numa_info mm/vmalloc.c:4936 [inline]
 vmalloc_info_show+0x5a8/0x7e0 mm/vmalloc.c:5016
 seq_read_iter+0x373/0xb40 fs/seq_file.c:230
 proc_reg_read_iter+0x11e/0x170 fs/proc/inode.c:299
 ....

write to 0xffff88800971fe30 of 4 bytes by task 8287 on cpu 1:
 show_numa_info mm/vmalloc.c:4934 [inline]
 vmalloc_info_show+0x38f/0x7e0 mm/vmalloc.c:5016
 seq_read_iter+0x373/0xb40 fs/seq_file.c:230
 proc_reg_read_iter+0x11e/0x170 fs/proc/inode.c:299
 ....

value changed: 0x0000008f -> 0x00000000
==================================================================

According to this report, there is a read/write data race because
m->private is accessible to multiple CPUs. To fix this, instead of
allocating the buffer in proc_vmalloc_init() and passing its address
around via m->private, vmalloc_info_show() should allocate the buffer
itself.
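[editor's note: the two seq_file patterns side by side; a generic sketch, not the patch itself. A buffer installed once at proc-registration time is shared by every concurrent reader of the file, while a buffer allocated inside ->show() is private to each call:]

static int racy_show(struct seq_file *m, void *p)
{
	unsigned int *counters = m->private;	/* shared by all readers */

	/* ... fill and print counters: races with concurrent reads ... */
	return 0;
}

static int safe_show(struct seq_file *m, void *p)
{
	unsigned int *counters = kmalloc_array(nr_node_ids,
					sizeof(*counters), GFP_KERNEL);

	if (!counters)
		return -ENOMEM;
	/* ... fill and print counters: private to this call ... */
	kfree(counters);
	return 0;
}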
Link: https://lkml.kernel.org/r/20250508165620.15321-1-aha310510@gmail.com
Fixes: 8e1d743f2c26 ("mm: vmalloc: support multiple nodes in vmallocinfo")
Signed-off-by: Jeongjun Park
Suggested-by: Eric Dumazet
Suggested-by: Andrew Morton
Reviewed-by: "Uladzislau Rezki (Sony)"
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 63 +++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 701ea4ec8950..49df04e1fbe1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3109,7 +3109,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 	/*
 	 * Before removing VM_UNINITIALIZED,
 	 * we should make sure that vm has proper values.
-	 * Pair with smp_rmb() in show_numa_info().
+	 * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
 	 */
 	smp_wmb();
 	vm->flags &= ~VM_UNINITIALIZED;
@@ -4939,28 +4939,29 @@ bool vmalloc_dump_obj(void *object)
 #endif
 
 #ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
-	if (IS_ENABLED(CONFIG_NUMA)) {
-		unsigned int nr, *counters = m->private;
-		unsigned int step = 1U << vm_area_page_order(v);
 
-		if (!counters)
-			return;
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and VM_UNINITIALIZED bit in v->flags is disabled.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+				 unsigned int *counters)
+{
+	unsigned int nr;
+	unsigned int step = 1U << vm_area_page_order(v);
 
-		if (v->flags & VM_UNINITIALIZED)
-			return;
-		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
-		smp_rmb();
+	if (!counters)
+		return;
 
-		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+	memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 
-		for (nr = 0; nr < v->nr_pages; nr += step)
-			counters[page_to_nid(v->pages[nr])] += step;
-		for_each_node_state(nr, N_HIGH_MEMORY)
-			if (counters[nr])
-				seq_printf(m, " N%u=%u", nr, counters[nr]);
-	}
+	for (nr = 0; nr < v->nr_pages; nr += step)
+		counters[page_to_nid(v->pages[nr])] += step;
+	for_each_node_state(nr, N_HIGH_MEMORY)
+		if (counters[nr])
+			seq_printf(m, " N%u=%u", nr, counters[nr]);
 }
 
 static void show_purge_info(struct seq_file *m)
@@ -4984,6 +4985,10 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *v;
+	unsigned int *counters;
+
+	if (IS_ENABLED(CONFIG_NUMA))
+		counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
 
 	for_each_vmap_node(vn) {
 		spin_lock(&vn->busy.lock);
@@ -4998,6 +5003,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 			}
 
 			v = va->vm;
+			if (v->flags & VM_UNINITIALIZED)
+				continue;
+
+			/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+			smp_rmb();
 
 			seq_printf(m, "0x%pK-0x%pK %7ld",
 				v->addr, v->addr + v->size, v->size);
@@ -5032,7 +5042,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 			if (is_vmalloc_addr(v->pages))
 				seq_puts(m, " vpages");
 
-			show_numa_info(m, v);
+			if (IS_ENABLED(CONFIG_NUMA))
+				show_numa_info(m, v, counters);
+
 			seq_putc(m, '\n');
 		}
 		spin_unlock(&vn->busy.lock);
@@ -5042,19 +5054,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 	 * As a final step, dump "unpurged" areas.
 	 */
 	show_purge_info(m);
+	if (IS_ENABLED(CONFIG_NUMA))
+		kfree(counters);
 
 	return 0;
 }
 
 static int __init proc_vmalloc_init(void)
 {
-	void *priv_data = NULL;
-
-	if (IS_ENABLED(CONFIG_NUMA))
-		priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
-	proc_create_single_data("vmallocinfo",
-		0400, NULL, vmalloc_info_show, priv_data);
-
+	proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
 	return 0;
 }
 module_init(proc_vmalloc_init);
--
cgit v1.2.3