From 54f7a49c20ebb5189980c53e6e66709d22bee572 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:20 +1200 Subject: mm: remove the implementation of swap_free() and always use swap_free_nr() To streamline maintenance efforts, we propose removing the implementation of swap_free(). Instead, we can simply invoke swap_free_nr() with nr set to 1. swap_free_nr() is designed with a bitmap consisting of only one long, resulting in overhead that can be ignored for cases where nr equals 1. A prime candidate for leveraging swap_free_nr() lies within kernel/power/swap.c. Implementing this change facilitates the adoption of batch processing for hibernation. Link: https://lkml.kernel.org/r/20240529082824.150954-3-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Andreas Larsson Cc: Baolin Wang Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/power/swap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 753b8dd42a59..82b884b67152 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -200,12 +200,11 @@ void free_all_swap_pages(int swap) while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; - unsigned long offset; ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_nr(swp_entry(swap, ext->start), + ext->end - ext->start + 1); kfree(ext); } -- cgit v1.2.3 From 76ba6acfcce871db13ad51c6dc8f56fec2e92853 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Thu, 20 Jun 2024 20:21:24 +0800 Subject: mm: optimize the redundant loop of mm_update_owner_next() When mm_update_owner_next() is racing with swapoff (try_to_unuse()) or /proc or ptrace or page migration (get_task_mm()), it is impossible to find an appropriate task_struct in the loop whose mm_struct is the same as the target mm_struct. If the above race condition is combined with the stress-ng-zombie and stress-ng-dup tests, such a long loop can easily cause a Hard Lockup in write_lock_irq() for tasklist_lock. Recognize this situation in advance and exit early. Link: https://lkml.kernel.org/r/20240620122123.3877432-1-alexjlzheng@tencent.com Signed-off-by: Jinliang Zheng Acked-by: Michal Hocko Cc: Christian Brauner Cc: Jens Axboe Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Tycho Andersen Cc: Signed-off-by: Andrew Morton --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f95a2c1338a8..81fcee45d630 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -484,6 +484,8 @@ retry: * Search through everything else, we should not get here often. 
*/ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { -- cgit v1.2.3 From 15bde4abab734c687c1f81704886aba3a70c268e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:35 +1200 Subject: mm: extend rmap flags arguments for folio_add_new_anon_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. [akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 2 +- mm/rmap.c | 25 ++++++++++++++++--------- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- 9 files changed, 28 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 980fa5d75d69..0978c64f49d8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -244,7 +244,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ diff --git 
a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2c83ba776fc7..c20368aa33dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 46ba81240d96..14a05c643806 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -974,7 +974,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f8d08b49420c..409f67a817f1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1210,7 +1210,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); diff --git a/mm/memory.c b/mm/memory.c index d8a0b7d2e15b..a4fc6e632d2c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,7 +930,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); - folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; @@ -3402,7 +3402,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. 
*/ ptep_clear_flush(vma, vmf->address, vmf->pte); - folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); @@ -4339,7 +4339,7 @@ check_folio: /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, @@ -4594,7 +4594,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); #endif - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) @@ -4792,7 +4792,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 051d0a3ccbee..6d66dc1c6ffa 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -658,7 +658,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); if (!folio_is_zone_device(folio)) folio_add_lru_vma(folio, vma); folio_get(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 69cbd7ac2a5c..c0c99f91ade1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1401,30 +1401,35 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * The folio does not have to be locked. + * The folio doesn't necessarily need to be locked while it's exclusive + * unless two threads map it concurrently. However, the folio must be + * locked if it's shared. * - * If the folio is pmd-mappable, it is accounted as a THP. As the folio - * is new, it's assumed to be mapped exclusively by a single process. + * If the folio is pmd-mappable, it is accounted as a THP. 
*/ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, rmap_t flags) { - int nr = folio_nr_pages(folio); + const int nr = folio_nr_pages(folio); + const bool exclusive = flags & RMAP_EXCLUSIVE; int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); __folio_set_swapbacked(folio); - __folio_set_anon(folio, vma, address, true); + __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; @@ -1433,7 +1438,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); - SetPageAnonExclusive(page); + if (exclusive) + SetPageAnonExclusive(page); } /* increment count (starts at -1) */ @@ -1445,7 +1451,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c6d8e557c0f..ae1d2700f6a3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1911,7 +1911,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5e7f2801698a..8dedaec00486 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -216,7 +216,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { - folio_add_new_anon_rmap(folio, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, dst_vma); } -- cgit v1.2.3 From c02525a33969000fa7b595b743deb4d79804916b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:45 +0200 Subject: ftrace: unpoison ftrace_regs in ftrace_ops_list_func() Patch series "kmsan: Enable on s390", v7. Architectures use assembly code to initialize ftrace_regs and call ftrace_ops_list_func(). Therefore, from the KMSAN's point of view, ftrace_regs is poisoned on ftrace_ops_list_func entry(). This causes KMSAN warnings when running the ftrace testsuite. Fix by trusting the architecture-specific assembly code and always unpoisoning ftrace_regs in ftrace_ops_list_func. The issue was not encountered on x86_64 so far only by accident: assembly-allocated ftrace_regs was overlapping a stale partially unpoisoned stack frame. Poisoning stack frames before returns [1] makes the issue appear on x86_64 as well. 
[1] https://github.com/iii-i/llvm-project/commits/msan-poison-allocas-before-returning-2024-06-12/ Link: https://lkml.kernel.org/r/20240621113706.315500-1-iii@linux.ibm.com Link: https://lkml.kernel.org/r/20240621113706.315500-2-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Acked-by: Steven Rostedt (Google) Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eacab4020508..f1150f081d6b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7404,6 +7404,7 @@ out: void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { + kmsan_unpoison_memory(fregs, sizeof(*fregs)); __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } #else -- cgit v1.2.3 From 2a22b773b15f5aa97c029acad79bda11ce5f2b4d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Jun 2024 17:29:24 +0200 Subject: memcg: mm_update_next_owner: kill the "retry" logic Add the new helper, try_to_set_owner(), which tries to update mm->owner once we see c->mm == mm. This way mm_update_next_owner() doesn't need to restart the list_for_each_entry/for_each_process loops from the very beginning if it races with exit/exec, it can just continue. Unlike the current code, try_to_set_owner() re-checks tsk->mm == mm before it drops tasklist_lock, so it doesn't need get/put_task_struct(). Link: https://lkml.kernel.org/r/20240626152924.GA17933@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jens Axboe Cc: Jinliang Zheng Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Tycho Andersen Signed-off-by: Andrew Morton --- kernel/exit.c | 57 +++++++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 81fcee45d630..877fae2cc705 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -439,6 +439,23 @@ static void coredump_task_exit(struct task_struct *tsk) } #ifdef CONFIG_MEMCG +/* drops tasklist_lock if succeeds */ +static bool try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +{ + bool ret = false; + + task_lock(tsk); + if (likely(tsk->mm == mm)) { + /* tsk can't pass exit_mm/exec_mmap and exit */ + read_unlock(&tasklist_lock); + WRITE_ONCE(mm->owner, tsk); + lru_gen_migrate_mm(mm); + ret = true; + } + task_unlock(tsk); + return ret; +} + /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -446,7 +463,6 @@ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; -retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. 
@@ -468,16 +484,16 @@ retry: * Search in the children */ list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + if (c->mm == mm && try_to_set_owner(c, mm)) + goto ret; } /* * Search in the siblings */ list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + if (c->mm == mm && try_to_set_owner(c, mm)) + goto ret; } /* @@ -489,9 +505,11 @@ retry: if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - if (c->mm) + struct mm_struct *c_mm = READ_ONCE(c->mm); + if (c_mm == mm) { + if (try_to_set_owner(c, mm)) + goto ret; + } else if (c_mm) break; } } @@ -502,30 +520,9 @@ retry: * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); + ret: return; -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. - * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - WRITE_ONCE(mm->owner, c); - lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); } #endif /* CONFIG_MEMCG */ -- cgit v1.2.3 From d73d00352145fb51d31771047aa939850d87fa50 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Jun 2024 17:29:30 +0200 Subject: memcg: mm_update_next_owner: move for_each_thread() into try_to_set_owner() mm_update_next_owner() checks the children / real_parent->children to avoid the "everything else" loop in the likely case, but this won't work if a child/sibling has a zombie leader with ->mm == NULL. Move the for_each_thread() logic into try_to_set_owner(), if nothing else this makes the children/siblings/everything searches more consistent. Link: https://lkml.kernel.org/r/20240626152930.GA17936@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jens Axboe Cc: Jinliang Zheng Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Tycho Andersen Signed-off-by: Andrew Morton --- kernel/exit.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 877fae2cc705..a5dd736c6767 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -440,7 +440,7 @@ static void coredump_task_exit(struct task_struct *tsk) #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ -static bool try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; @@ -456,12 +456,28 @@ static bool try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) return ret; } +static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(g, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm == mm) { + if (__try_to_set_owner(t, mm)) + return true; + } else if (t_mm) + break; + } + + return false; +} + /* * A task is exiting. If it owned this mm, find a new owner for the mm. 
*/ void mm_update_next_owner(struct mm_struct *mm) { - struct task_struct *c, *g, *p = current; + struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's @@ -483,19 +499,17 @@ void mm_update_next_owner(struct mm_struct *mm) /* * Search in the children */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm && try_to_set_owner(c, mm)) + list_for_each_entry(g, &p->children, sibling) { + if (try_to_set_owner(g, mm)) goto ret; } - /* * Search in the siblings */ - list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm && try_to_set_owner(c, mm)) + list_for_each_entry(g, &p->real_parent->children, sibling) { + if (try_to_set_owner(g, mm)) goto ret; } - /* * Search through everything else, we should not get here often. */ @@ -504,14 +518,8 @@ void mm_update_next_owner(struct mm_struct *mm) break; if (g->flags & PF_KTHREAD) continue; - for_each_thread(g, c) { - struct mm_struct *c_mm = READ_ONCE(c->mm); - if (c_mm == mm) { - if (try_to_set_owner(c, mm)) - goto ret; - } else if (c_mm) - break; - } + if (try_to_set_owner(g, mm)) + goto ret; } read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From 8ac5dc66599c5c545cefd314dd34a109edce2784 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Jun 2024 21:10:17 +0200 Subject: get_task_mm: check PF_KTHREAD lockless Nowadays PF_KTHREAD is sticky and it was never protected by ->alloc_lock. Move the PF_KTHREAD check outside of task_lock() section to make this code more understandable. Link: https://lkml.kernel.org/r/20240626191017.GA20031@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Eric W. Biederman Signed-off-by: Andrew Morton --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 99076dbe27d8..279efadabbf2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1536,14 +1536,13 @@ struct mm_struct *get_task_mm(struct task_struct *task) { struct mm_struct *mm; + if (task->flags & PF_KTHREAD) + return NULL; + task_lock(task); mm = task->mm; - if (mm) { - if (task->flags & PF_KTHREAD) - mm = NULL; - else - mmget(mm); - } + if (mm) + mmget(mm); task_unlock(task); return mm; } -- cgit v1.2.3 From 3a3b7fec3974f954600844e41d773c00857ef48a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 1 Jul 2024 11:31:15 -0400 Subject: mm: remove CONFIG_MEMCG_KMEM CONFIG_MEMCG_KMEM used to be a user-visible option for whether slab tracking is enabled. It has been default-enabled and equivalent to CONFIG_MEMCG for almost a decade. We've only grown more kernel memory accounting sites since, and there is no imaginable cgroup usecase going forward that wants to track user pages but not the multitude of user-drivable kernel allocations. 
Link: https://lkml.kernel.org/r/20240701153148.452230-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/bpf.h | 4 +-- include/linux/list_lru.h | 2 +- include/linux/memcontrol.h | 22 +++---------- include/linux/sched.h | 3 +- include/linux/slab.h | 12 +++---- include/trace/events/kmem.h | 4 +-- init/Kconfig | 5 --- kernel/bpf/memalloc.c | 9 ++---- kernel/bpf/syscall.c | 6 ++-- mm/kfence/core.c | 6 ++-- mm/kfence/kfence.h | 2 +- mm/list_lru.c | 14 ++++---- mm/memcontrol-v1.c | 6 ++-- mm/memcontrol.c | 60 ++++------------------------------- mm/percpu-internal.h | 6 ++-- mm/percpu.c | 6 ++-- mm/slab.h | 2 +- mm/slab_common.c | 10 +++--- mm/slub.c | 10 +++--- tools/testing/selftests/cgroup/config | 1 - 20 files changed, 59 insertions(+), 131 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5e694a308081..b8637555c9c2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -275,7 +275,7 @@ struct bpf_map { u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; @@ -2252,7 +2252,7 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 792b67ceb631..5099a8ccd5f4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -50,7 +50,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct list_head list; int shrinker_id; bool memcg_aware; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60418934827c..7e2eb091049a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -195,7 +195,7 @@ struct mem_cgroup { /* Range enforcement for interrupt charges */ struct work_struct high_work; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP unsigned long zswap_max; /* @@ -236,7 +236,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; -#ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting @@ -247,7 +246,6 @@ struct mem_cgroup { struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; -#endif struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -532,7 +530,6 @@ retry: return memcg; } -#ifdef CONFIG_MEMCG_KMEM /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. @@ -548,15 +545,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return folio->memcg_data & MEMCG_DATA_KMEM; } - -#else -static inline bool folio_memcg_kmem(struct folio *folio) -{ - return false; -} - -#endif - static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); @@ -1488,7 +1476,7 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or * if MEMCG_DATA_OBJEXTS is set. 
*/ struct slabobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -1663,7 +1651,7 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, } #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); @@ -1806,9 +1794,9 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); diff --git a/include/linux/sched.h b/include/linux/sched.h index a7770c566c4d..82da65131a6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,9 +1457,8 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; -#endif -#ifdef CONFIG_MEMCG_KMEM + /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 7247e217e21b..a332dd2fa6cd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -41,7 +41,7 @@ enum _slab_flag_bits { #ifdef CONFIG_FAILSLAB _SLAB_FAILSLAB, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG _SLAB_ACCOUNT, #endif #ifdef CONFIG_KASAN_GENERIC @@ -171,7 +171,7 @@ enum _slab_flag_bits { # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /* Account to memcg */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED @@ -407,7 +407,7 @@ enum kmalloc_cache_type { #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif -#ifndef CONFIG_MEMCG_KMEM +#ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, @@ -420,7 +420,7 @@ enum kmalloc_cache_type { #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES @@ -435,7 +435,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ - (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; @@ -463,7 +463,7 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; - if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 8a829e0f6e55..b37eb0a7060f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -36,7 +36,7 @@ TRACE_EVENT(kmem_cache_alloc, __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; - __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? + __entry->accounted = IS_ENABLED(CONFIG_MEMCG) ? 
((gfp_flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)) : false; ), @@ -87,7 +87,7 @@ TRACE_EVENT(kmalloc, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, - (IS_ENABLED(CONFIG_MEMCG_KMEM) && + (IS_ENABLED(CONFIG_MEMCG) && (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); diff --git a/init/Kconfig b/init/Kconfig index aca0ae9be04f..26bf8bb0a7ce 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -986,11 +986,6 @@ config MEMCG_V1 San N is unsure. -config MEMCG_KMEM - bool - depends on MEMCG - default y - config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index a546aba46d5d..dec892ded031 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -155,12 +155,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (c->objcg) return get_mem_cgroup_from_objcg(c->objcg); -#endif - -#ifdef CONFIG_MEMCG return root_mem_cgroup; #else return NULL; @@ -534,7 +531,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif @@ -556,7 +553,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f45ed6adc092..8f716f06c345 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -385,7 +385,7 @@ void bpf_map_free_id(struct bpf_map *map) spin_unlock_irqrestore(&map_idr_lock, flags); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root @@ -486,7 +486,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; memcg = bpf_map_get_memcg(map); @@ -505,7 +505,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, break; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG set_active_memcg(old_memcg); mem_cgroup_put(memcg); #endif diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 83f8e78827c0..c5cb54fc696d 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -602,7 +602,7 @@ static unsigned long kfence_init_pool(void) continue; __folio_set_slab(slab_folio(slab)); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif @@ -652,7 +652,7 @@ reset_slab: if (!i || (i % 2)) continue; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); @@ -1146,7 +1146,7 @@ void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 084f5f36e8e7..db87a05047bd 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -97,7 +97,7 @@ struct kfence_metadata { struct 
kfence_track free_track; /* For updating alloc_covered on frees. */ u32 alloc_stack_hash; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct slabobj_ext obj_exts; #endif }; diff --git a/mm/list_lru.c b/mm/list_lru.c index 3fd64736bc45..a29d96929d7c 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,7 +15,7 @@ #include "slab.h" #include "internal.h" -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -83,7 +83,7 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) @@ -294,7 +294,7 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; unsigned long index; @@ -324,7 +324,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) { int nid; @@ -544,14 +544,14 @@ static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else @@ -591,7 +591,7 @@ void list_lru_destroy(struct list_lru *lru) kfree(lru->node); lru->node = NULL; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 7218884bc3c9..6b3e56e88a8a 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -2756,7 +2756,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, return 0; } -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG static int mem_cgroup_slab_show(struct seq_file *m, void *p) { /* @@ -2863,7 +2863,7 @@ struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG { .name = "kmem.slabinfo", .seq_show = mem_cgroup_slab_show, @@ -2922,7 +2922,6 @@ struct cftype memsw_files[] = { { }, /* terminate */ }; -#ifdef CONFIG_MEMCG_KMEM void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { @@ -2932,7 +2931,6 @@ void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) page_counter_uncharge(&memcg->kmem, -nr_pages); } } -#endif /* CONFIG_MEMCG_KMEM */ bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a2339396cfcb..9ddce038ddda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -118,7 +118,6 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) #define CURRENT_OBJCG_UPDATE_BIT 0 #define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT) -#ifdef CONFIG_MEMCG_KMEM static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) @@ -223,7 +222,6 @@ EXPORT_SYMBOL(memcg_kmem_online_key); 
DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(memcg_bpf_enabled_key); -#endif /** * mem_cgroup_css_from_folio - css of the memcg associated with a folio @@ -423,7 +421,7 @@ static const unsigned int memcg_vm_event_stat[] = { PGDEACTIVATE, PGLAZYFREE, PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, ZSWPWB, @@ -1346,7 +1344,7 @@ static const struct memory_stat memory_stats[] = { { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, { "zswapped", MEMCG_ZSWAPPED }, #endif @@ -1700,13 +1698,11 @@ struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; -#ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; -#endif struct work_struct work; unsigned long flags; @@ -1717,23 +1713,10 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { }; static DEFINE_MUTEX(percpu_charge_mutex); -#ifdef CONFIG_MEMCG_KMEM static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); -#else -static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) -{ - return NULL; -} -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, - struct mem_cgroup *root_memcg) -{ - return false; -} -#endif - /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. @@ -2412,8 +2395,6 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) local_irq_enable(); } -#ifdef CONFIG_MEMCG_KMEM - static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) @@ -3069,7 +3050,6 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, obj_cgroup_put(objcg); } } -#endif /* CONFIG_MEMCG_KMEM */ /* * Because folio_memcg(head) is not set on tails, set it now. 
@@ -3116,7 +3096,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val; } -#ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; @@ -3167,15 +3146,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg_reparent_list_lrus(memcg, parent); } -#else -static int memcg_online_kmem(struct mem_cgroup *memcg) -{ - return 0; -} -static void memcg_offline_kmem(struct mem_cgroup *memcg) -{ -} -#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_CGROUP_WRITEBACK @@ -3590,10 +3560,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) vmpressure_init(&memcg->vmpressure); memcg->socket_pressure = jiffies; memcg1_memcg_init(memcg); -#ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; INIT_LIST_HEAD(&memcg->objcg_list); -#endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -3627,7 +3595,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg1_soft_limit_reset(memcg); -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP memcg->zswap_max = PAGE_COUNTER_MAX; WRITE_ONCE(memcg->zswap_writeback, !parent || READ_ONCE(parent->zswap_writeback)); @@ -3659,10 +3627,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); -#if defined(CONFIG_MEMCG_KMEM) if (!cgroup_memory_nobpf) static_branch_inc(&memcg_bpf_enabled_key); -#endif return &memcg->css; } @@ -3755,10 +3721,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg)) static_branch_dec(&memcg_sockets_enabled_key); -#if defined(CONFIG_MEMCG_KMEM) if (!cgroup_memory_nobpf) static_branch_dec(&memcg_bpf_enabled_key); -#endif vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); @@ -3901,7 +3865,6 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) atomic64_set(&memcg->vmstats->stats_updates, 0); } -#ifdef CONFIG_MEMCG_KMEM static void mem_cgroup_fork(struct task_struct *task) { /* @@ -3929,7 +3892,6 @@ static void mem_cgroup_exit(struct task_struct *task) */ task->objcg = NULL; } -#endif #ifdef CONFIG_LRU_GEN static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) @@ -3953,7 +3915,6 @@ static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} #endif /* CONFIG_LRU_GEN */ -#ifdef CONFIG_MEMCG_KMEM static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -3964,17 +3925,12 @@ static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); } } -#else -static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {} -#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) static void mem_cgroup_attach(struct cgroup_taskset *tset) { mem_cgroup_lru_gen_attach(tset); mem_cgroup_kmem_attach(tset); } -#endif static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { @@ -4421,13 +4377,9 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, -#if defined(CONFIG_LRU_GEN) || 
defined(CONFIG_MEMCG_KMEM) .attach = mem_cgroup_attach, -#endif -#ifdef CONFIG_MEMCG_KMEM .fork = mem_cgroup_fork, .exit = mem_cgroup_exit, -#endif .dfl_cftypes = memory_files, #ifdef CONFIG_MEMCG_V1 .can_attach = memcg1_can_attach, @@ -5395,7 +5347,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP /** * obj_cgroup_may_zswap - check if this cgroup can zswap * @objcg: the object cgroup @@ -5577,7 +5529,7 @@ static struct cftype zswap_files[] = { }, { } /* terminate */ }; -#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ +#endif /* CONFIG_ZSWAP */ static int __init mem_cgroup_swap_init(void) { @@ -5588,7 +5540,7 @@ static int __init mem_cgroup_swap_init(void) #ifdef CONFIG_MEMCG_V1 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #endif -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); #endif return 0; diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 7e42f0ca3b7b..4b3d6ec43703 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -33,7 +33,7 @@ struct pcpu_block_md { }; struct pcpuobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -41,7 +41,7 @@ struct pcpuobj_ext { #endif }; -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) +#if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif @@ -154,7 +154,7 @@ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif diff --git a/mm/percpu.c b/mm/percpu.c index 474e3683b74d..20d91af8c033 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1619,7 +1619,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { @@ -1681,7 +1681,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) obj_cgroup_put(objcg); } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { @@ -1697,7 +1697,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ #ifdef CONFIG_MEM_ALLOC_PROFILING static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, diff --git a/mm/slab.h b/mm/slab.h index 5f8f47c5bee0..3586e6183224 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -573,7 +573,7 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p); void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, diff --git a/mm/slab_common.c b/mm/slab_common.c index 1560a1546bb1..60268bb258fc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -725,7 +725,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_DMA_NAME(sz) #endif -#ifdef 
CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG #define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz, #else #define KMALLOC_CGROUP_NAME(sz) @@ -867,7 +867,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type) if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) { flags |= SLAB_RECLAIM_ACCOUNT; - } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { + } else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) { if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; @@ -883,10 +883,10 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type) #endif /* - * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for + * If CONFIG_MEMCG is enabled, disable cache merging for * KMALLOC_NORMAL caches. */ - if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL)) + if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL)) flags |= SLAB_NO_MERGE; if (minalign > ARCH_KMALLOC_MINALIGN) { @@ -913,7 +913,7 @@ void __init create_kmalloc_caches(void) enum kmalloc_cache_type type; /* - * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined + * Including KMALLOC_CGROUP if CONFIG_MEMCG defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { /* Caches that are NOT of the two-to-the-power-of size. */ diff --git a/mm/slub.c b/mm/slub.c index 177ad7d3288b..cc11f3869cc6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2020,7 +2020,7 @@ static inline bool need_slab_obj_ext(void) return true; /* - * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally + * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally * inside memcg_slab_post_alloc_hook. No other users for now. */ return false; @@ -2104,7 +2104,7 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, #endif /* CONFIG_SLAB_OBJ_EXT */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); @@ -2146,7 +2146,7 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, obj_exts); } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, @@ -2159,7 +2159,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ /* * Hooks for other subsystems that check memory allocations. 
In a typical @@ -4456,7 +4456,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, do_slab_free(s, slab, object, object, 1, addr); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG /* Do not inline the rare memcg charging failed path into the allocation path */ static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 97d549ee894f..39f979690dd3 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -3,5 +3,4 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y -CONFIG_MEMCG_KMEM=y CONFIG_PAGE_COUNTER=y -- cgit v1.2.3 From 66b4aaf7335c9e7f59ba93a45379ba2bbfe1b913 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jul 2024 01:34:09 +0000 Subject: kernel/fork.c: get totalram_pages from memblock to calculate max_threads Since we plan to move the accounting into __free_pages_core(), totalram_pages may not represent the total usable pages on system at this point when defer_init is enabled. Instead we can get the total usable pages from memblock directly. Link: https://lkml.kernel.org/r/20240701013410.17260-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 279efadabbf2..60309c6d6074 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -999,7 +1000,7 @@ void __init __weak arch_task_cache_init(void) { } static void set_max_threads(unsigned int max_threads_suggested) { u64 threads; - unsigned long nr_pages = totalram_pages(); + unsigned long nr_pages = PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); /* * The number of threads shall be limited such that the thread -- cgit v1.2.3 From 9325585288f2742b4b6effd5246154c374b9100f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jul 2024 01:34:10 +0000 Subject: kernel/fork.c: put set_max_threads()/task_struct_whitelist() in __init section The functions set_max_threads() and task_struct_whitelist() are only used by fork_init() during bootup. Let's add __init tag to them. Link: https://lkml.kernel.org/r/20240701013410.17260-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: Oleg Nesterov Cc: David Hildenbrand Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 60309c6d6074..b56b37c484d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -997,7 +997,7 @@ void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ -static void set_max_threads(unsigned int max_threads_suggested) +static void __init set_max_threads(unsigned int max_threads_suggested) { u64 threads; unsigned long nr_pages = PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); @@ -1023,7 +1023,7 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif -static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. 
*/ arch_thread_struct_whitelist(offset, size); -- cgit v1.2.3 From 18d095b2556e5e1292003c8e9f5d845ed42ef89b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:19 +0200 Subject: mm: define __pte_leaf_size() to also take a PMD entry On powerpc 8xx, when a page is 8M size, the information is in the PMD entry. So allow architectures to provide __pte_leaf_size() instead of pte_leaf_size() and provide the PMD entry to that function. When __pte_leaf_size() is not defined, define it as a pte_leaf_size() so that architectures not interested in the PMD arguments are not impacted. Only define a default pte_leaf_size() when __pte_leaf_size() is not defined to make sure nobody adds new calls to pte_leaf_size() in the core. Link: https://lkml.kernel.org/r/c7c008f0a314bf8029ad7288fdc908db1ec7e449.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ kernel/events/core.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2f32eaccf0b9..2a6a3cccfc36 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1907,9 +1907,12 @@ typedef unsigned int pgtbl_mod_mask; #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif +#ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif +#define __pte_leaf_size(x,y) pte_leaf_size(y) +#endif /* * We always define pmd_pfn for all archs as it's used in lots of generic diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f908f077935..b2ca11bdc11e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7609,7 +7609,7 @@ again: pte = ptep_get_lockless(ptep); if (pte_present(pte)) - size = pte_leaf_size(pte); + size = __pte_leaf_size(pmd, pte); pte_unmap(ptep); #endif /* CONFIG_HAVE_GUP_FAST */ -- cgit v1.2.3 From a7526fe8b94eced7d82aa00b2bcca44e39ae0769 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2024 18:35:30 +0200 Subject: mm, slab: put should_failslab() back behind CONFIG_SHOULD_FAILSLAB Patch series "revert unconditional slab and page allocator fault injection calls". These two patches largely revert commits that added function call overhead into slab and page allocation hotpaths and that cannot be currently disabled even though related CONFIG_ options do exist. A much more involved solution that can keep the callsites always existing but hidden behind a static key if unused, is possible [1] and can be pursued by anyone who believes it's necessary. Meanwhile the fact the should_failslab() error injection is already not functional on kernels built with current gcc without anyone noticing [2], and lukewarm response to [1] suggests the need is not there. I believe it will be more fair to have the state after this series as a baseline for possible further optimisation, instead of the unconditional overhead. For example a possible compromise for anyone who's fine with an empty function call overhead but not the full CONFIG_FAILSLAB / CONFIG_FAIL_PAGE_ALLOC overhead is to reuse patch 1 from [1] but insert a static key check only inside should_failslab() and should_fail_alloc_page() before performing the more expensive checks. 
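As a purely illustrative sketch of that compromise (not part of this series): the hook itself could hide its filtering behind a static key, so the unconditional call site stays but costs only a patched-out branch until fault injection is armed. The key name failslab_active is hypothetical; should_failslab(), __should_failslab() and the -ENOMEM convention are taken from the patches below.

/* Hedged sketch only; "failslab_active" is a made-up key name. */
DEFINE_STATIC_KEY_FALSE(failslab_active);

int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
	/* Key disabled: a single no-op branch in the slab hot path. */
	if (!static_branch_unlikely(&failslab_active))
		return 0;

	/* Key armed: fall through to the existing gfp/cache filters. */
	return __should_failslab(s, gfpflags) ? -ENOMEM : 0;
}

The key would be flipped with static_branch_enable() when a fault-injection attribute is actually configured, e.g. through the debugfs interface.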
[1] https://lore.kernel.org/all/20240620-fault-injection-statickeys-v2-0-e23947d3d84b@suse.cz/#t [2] https://github.com/bpftrace/bpftrace/issues/3258 This patch (of 2): This mostly reverts commit 4f6923fbb352 ("mm: make should_failslab always available for fault injection"). The commit made should_failslab() a noinline function that's always called from the slab allocation hotpath, even if it's empty because CONFIG_SHOULD_FAILSLAB is not enabled, and there is no option to disable that call. This is visible in profiles and the function call overhead can be noticeable especially with cpu mitigations. Meanwhile the bpftrace program example in the commit silently does not work without CONFIG_SHOULD_FAILSLAB anyway with a recent gcc, because the empty function gets a .constprop clone that is actually being called (uselessly) from the slab hotpath, while the error injection is hooked to the original function that's not being called at all [1]. Thus put the whole should_failslab() function back behind CONFIG_SHOULD_FAILSLAB. It's not a complete revert of 4f6923fbb352 - the int return type that returns -ENOMEM on failure is preserved, as well as the ALLOW_ERROR_INJECTION annotation. The BTF_ID() record that was meanwhile added is also guarded by CONFIG_SHOULD_FAILSLAB. [1] https://github.com/bpftrace/bpftrace/issues/3258 Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-0-9e2651945d68@suse.cz Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-1-9e2651945d68@suse.cz Signed-off-by: Vlastimil Babka Cc: Akinobu Mita Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Christoph Lameter Cc: Daniel Borkmann Cc: David Rientjes Cc: Eduard Zingerman Cc: Hao Luo Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Mateusz Guzik Cc: Roman Gushchin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 5 ++--- kernel/bpf/verifier.c | 2 ++ mm/failslab.c | 14 ++++++++------ mm/slub.c | 8 -------- 4 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 6d5edef09d45..be6d0bc111ad 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -102,11 +102,10 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) } #endif /* CONFIG_FAIL_PAGE_ALLOC */ -int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB -extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); +int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #else -static inline bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) +static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) { return false; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 214a9fa8c6fb..e455654f3b91 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21123,7 +21123,9 @@ BTF_SET_START(btf_non_sleepable_error_inject) */ BTF_ID(func, __filemap_add_folio) BTF_ID(func, should_fail_alloc_page) +#ifdef CONFIG_FAILSLAB BTF_ID(func, should_failslab) +#endif BTF_SET_END(btf_non_sleepable_error_inject) static int check_non_sleepable_error_inject(u32 btf_id) diff --git a/mm/failslab.c b/mm/failslab.c index ffc420c0e767..af16c2ed578f 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include "slab.h" @@ -14,23 +15,23 @@
static struct { .cache_filter = false, }; -bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) +int should_failslab(struct kmem_cache *s, gfp_t gfpflags) { int flags = 0; /* No fault-injection for bootstrap cache */ if (unlikely(s == kmem_cache)) - return false; + return 0; if (gfpflags & __GFP_NOFAIL) - return false; + return 0; if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_DIRECT_RECLAIM)) - return false; + return 0; if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) - return false; + return 0; /* * In some cases, it expects to specify __GFP_NOWARN @@ -41,8 +42,9 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) if (gfpflags & __GFP_NOWARN) flags |= FAULT_NOWARN; - return should_fail_ex(&failslab.attr, s->object_size, flags); + return should_fail_ex(&failslab.attr, s->object_size, flags) ? -ENOMEM : 0; } +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); static int __init setup_failslab(char *str) { diff --git a/mm/slub.c b/mm/slub.c index cc11f3869cc6..b5aaaa3ca756 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3892,14 +3892,6 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, 0, sizeof(void *)); } -noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) -{ - if (__should_failslab(s, gfpflags)) - return -ENOMEM; - return 0; -} -ALLOW_ERROR_INJECTION(should_failslab, ERRNO); - static __fastpath_inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { -- cgit v1.2.3 From 53dabce2652fb854eae84609ce9c37429d5d87ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2024 18:35:31 +0200 Subject: mm, page_alloc: put should_fail_alloc_page() back behind CONFIG_FAIL_PAGE_ALLOC This mostly reverts commit af3b854492f3 ("mm/page_alloc.c: allow error injection"). The commit made should_fail_alloc_page() a noinline function that's always called from the page allocation hotpath, even if it's empty because CONFIG_FAIL_PAGE_ALLOC is not enabled, and there is no option to disable it and prevent the associated function call overhead. As with the preceding patch "mm, slab: put should_failslab back behind CONFIG_SHOULD_FAILSLAB" and for the same reasons, put should_fail_alloc_page() back behind the config option. When enabled, the ALLOW_ERROR_INJECTION and BTF_ID records are preserved so it's not a complete revert.
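To illustrate the effect on callers (the wrapper below is made up for the example and not taken from the tree): with CONFIG_FAIL_PAGE_ALLOC=n the header now provides only a static inline stub returning false, so the check compiles away and the allocation hotpath carries neither a branch nor a function call:

	#include <linux/fault-inject.h>
	#include <linux/gfp.h>

	/* Invented helper, for illustration only. */
	static struct page *example_alloc_page(gfp_t gfp_mask, unsigned int order)
	{
		/* Real call only with CONFIG_FAIL_PAGE_ALLOC=y; otherwise folded away. */
		if (should_fail_alloc_page(gfp_mask, order))
			return NULL;	/* injected allocation failure */

		return alloc_pages(gfp_mask, order);
	}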
Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-2-9e2651945d68@suse.cz Signed-off-by: Vlastimil Babka Cc: Akinobu Mita Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Christoph Lameter Cc: Daniel Borkmann Cc: David Rientjes Cc: Eduard Zingerman Cc: Hao Luo Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Mateusz Guzik Cc: Roman Gushchin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 6 ++---- kernel/bpf/verifier.c | 2 ++ mm/fail_page_alloc.c | 4 +++- mm/page_alloc.c | 6 ------ 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index be6d0bc111ad..354413950d34 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -91,12 +91,10 @@ static inline void fault_config_init(struct fault_config *config, struct kmem_cache; -bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); - #ifdef CONFIG_FAIL_PAGE_ALLOC -bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); #else -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return false; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e455654f3b91..a81e18409ec9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21122,7 +21122,9 @@ BTF_SET_START(btf_non_sleepable_error_inject) * Assume non-sleepable from bpf safety point of view. */ BTF_ID(func, __filemap_add_folio) +#ifdef CONFIG_FAIL_PAGE_ALLOC BTF_ID(func, should_fail_alloc_page) +#endif #ifdef CONFIG_FAILSLAB BTF_ID(func, should_failslab) #endif diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c index b1b09cce9394..532851ce5132 100644 --- a/mm/fail_page_alloc.c +++ b/mm/fail_page_alloc.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include static struct { @@ -21,7 +22,7 @@ static int __init setup_fail_page_alloc(char *str) } __setup("fail_page_alloc=", setup_fail_page_alloc); -bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { int flags = 0; @@ -41,6 +42,7 @@ bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } +ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46aedfc9a12..3398d914ed83 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3050,12 +3050,6 @@ out: return page; } -noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - return __should_fail_alloc_page(gfp_mask, order); -} -ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); - static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { -- cgit v1.2.3
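Taken together, the two reverts restore the usual pattern for optional debugging hooks: declare the real function only under its config option, provide a static inline stub otherwise, and keep the error-injection annotation next to the real definition. A generic sketch of that header idiom, with invented names:

	#ifdef CONFIG_FOO_FAULT_INJECTION
	/* Real implementation lives in foo.c next to its ALLOW_ERROR_INJECTION(). */
	int foo_should_fail(size_t size);
	#else
	static inline int foo_should_fail(size_t size)
	{
		/* Stub: callers' checks compile away when the option is off. */
		return 0;
	}
	#endif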