From 89b15332af7c0312a41e50846819ca6613b58b4c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:50:22 -0800 Subject: mm: drop mmap_sem before calling balance_dirty_pages() in write fault One of our services is observing hanging ps/top/etc under heavy write IO, and the task states show this is an mmap_sem priority inversion: A write fault is holding the mmap_sem in read-mode and waiting for (heavily cgroup-limited) IO in balance_dirty_pages(): balance_dirty_pages+0x724/0x905 balance_dirty_pages_ratelimited+0x254/0x390 fault_dirty_shared_page.isra.96+0x4a/0x90 do_wp_page+0x33e/0x400 __handle_mm_fault+0x6f0/0xfa0 handle_mm_fault+0xe4/0x200 __do_page_fault+0x22b/0x4a0 page_fault+0x45/0x50 Somebody tries to change the address space, contending for the mmap_sem in write-mode: call_rwsem_down_write_failed_killable+0x13/0x20 do_mprotect_pkey+0xa8/0x330 SyS_mprotect+0xf/0x20 do_syscall_64+0x5b/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 The waiting writer locks out all subsequent readers to avoid lock starvation, and several threads can be seen hanging like this: call_rwsem_down_read_failed+0x14/0x30 proc_pid_cmdline_read+0xa0/0x480 __vfs_read+0x23/0x140 vfs_read+0x87/0x130 SyS_read+0x42/0x90 do_syscall_64+0x5b/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 To fix this, do what we do for cache read faults already: drop the mmap_sem before calling into anything IO bound, in this case the balance_dirty_pages() function, and return VM_FAULT_RETRY. Link: http://lkml.kernel.org/r/20190924194238.GA29030@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Kirill A. Shutemov Cc: Josef Bacik Cc: Hillf Danton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b6a5d6a08438..9ea917e28ef4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2289,10 +2289,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) * * The function expects the page to be locked and unlocks it. */ -static void fault_dirty_shared_page(struct vm_area_struct *vma, - struct page *page) +static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; + struct page *page = vmf->page; bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; @@ -2307,16 +2308,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma, mapping = page_rmapping(page); unlock_page(page); + if (!page_mkwrite) + file_update_time(vma->vm_file); + + /* + * Throttle page dirtying rate down to writeback speed. + * + * mapping may be NULL here because some device drivers do not + * set page.mapping but still dirty their pages + * + * Drop the mmap_sem before waiting on IO, if we can. The file + * is pinning the mapping, as per above. + */ if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ + struct file *fpin; + + fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); + if (fpin) { + fput(fpin); + return VM_FAULT_RETRY; + } } - if (!page_mkwrite) - file_update_time(vma->vm_file); + return 0; } /* @@ -2571,6 +2586,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = VM_FAULT_WRITE; get_page(vmf->page); @@ -2594,10 +2610,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) wp_page_reuse(vmf); lock_page(vmf->page); } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); put_page(vmf->page); - return VM_FAULT_WRITE; + return ret; } /* @@ -3641,7 +3657,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) return ret; } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); return ret; } -- cgit v1.2.3 From b3d1411b6726ea6930222f8f12587d89762477c6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:30 -0800 Subject: mm: emit tracepoint when RSS changes Useful to track how RSS is changing per TGID to detect spikes in RSS and memory hogs. Several Android teams have been using this patch in various kernel trees for half a year now. Many reported to me it is really useful so I'm posting it upstream. Initial patch developed by Tim Murray. Changes I made from original patch: o Prevent any additional space consumed by mm_struct. Regarding the fact that the RSS may change too often thus flooding the traces - note that, there is some "hysterisis" with this already. That is - We update the counter only if we receive 64 page faults due to SPLIT_RSS_ACCOUNTING. However, during zapping or copying of pte range, the RSS is updated immediately which can become noisy/flooding. In a previous discussion, we agreed that BPF or ftrace can be used to rate limit the signal if this becomes an issue. Also note that I added wrappers to trace_rss_stat to prevent compiler errors where linux/mm.h is included from tracing code, causing errors such as: CC kernel/trace/power-traces.o In file included from ./include/trace/define_trace.h:102, from ./include/trace/events/kmem.h:342, from ./include/linux/mm.h:31, from ./include/linux/ring_buffer.h:5, from ./include/linux/trace_events.h:6, from ./include/trace/events/power.h:12, from kernel/trace/power-traces.c:15: ./include/trace/trace_events.h:113:22: error: field `ent' has incomplete type struct trace_entry ent; \ Link: http://lore.kernel.org/r/20190903200905.198642-1-joel@joelfernandes.org Link: http://lkml.kernel.org/r/20191001172817.234886-1-joel@joelfernandes.org Co-developed-by: Tim Murray Signed-off-by: Tim Murray Signed-off-by: Joel Fernandes (Google) Acked-by: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 +++++++++++--- include/trace/events/kmem.h | 21 +++++++++++++++++++++ mm/memory.c | 6 ++++++ 3 files changed, 38 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6fb714fa851..935383081397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,19 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } +void mm_trace_rss_stat(int member, long count); + static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - atomic_long_add(value, &mm->rss_stat.count[member]); + long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - atomic_long_inc(&mm->rss_stat.count[member]); + long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - atomic_long_dec(&mm->rss_stat.count[member]); + long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 69e8bb8963db..5a0666bfcf85 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -316,6 +316,27 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +TRACE_EVENT(rss_stat, + + TP_PROTO(int member, + long count), + + TP_ARGS(member, count), + + TP_STRUCT__entry( + __field(int, member) + __field(long, size) + ), + + TP_fast_assign( + __entry->member = member; + __entry->size = (count << PAGE_SHIFT); + ), + + TP_printk("member=%d size=%ldB", + __entry->member, + __entry->size) + ); #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/mm/memory.c b/mm/memory.c index 9ea917e28ef4..57c910aaba45 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -72,6 +72,8 @@ #include #include +#include + #include #include #include @@ -152,6 +154,10 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); +void mm_trace_rss_stat(int member, long count) +{ + trace_rss_stat(member, count); +} #if defined(SPLIT_RSS_COUNTING) -- cgit v1.2.3 From e4dcad204d3a281be6f8573e0a82648a4ad84e69 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:33 -0800 Subject: rss_stat: add support to detect RSS updates of external mm When a process updates the RSS of a different process, the rss_stat tracepoint appears in the context of the process doing the update. This can confuse userspace that the RSS of process doing the update is updated, while in reality a different process's RSS was updated. This issue happens in reclaim paths such as with direct reclaim or background reclaim. This patch adds more information to the tracepoint about whether the mm being updated belongs to the current process's context (curr field). We also include a hash of the mm pointer so that the process who the mm belongs to can be uniquely identified (mm_id field). Also vsprintf.c is refactored a bit to allow reuse of hashing code. [akpm@linux-foundation.org: remove unused local `str'] [joelaf@google.com: inline call to ptr_to_hashval] Link: http://lore.kernel.org/r/20191113153816.14b95acd@gandalf.local.home Link: http://lkml.kernel.org/r/20191114164622.GC233237@google.com Link: http://lkml.kernel.org/r/20191106024452.81923-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reported-by: Ioannis Ilkos Acked-by: Petr Mladek [lib/vsprintf.c] Cc: Tim Murray Cc: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- include/linux/string.h | 2 ++ include/trace/events/kmem.h | 32 +++++++++++++++++++++++++++++--- lib/vsprintf.c | 40 +++++++++++++++++++++++++++++----------- mm/memory.c | 4 ++-- 5 files changed, 66 insertions(+), 20 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 935383081397..b5b2523c80af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,27 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } -void mm_trace_rss_stat(int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_inc_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_dec_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/string.h b/include/linux/string.h index b6ccdc2c7f02..02894e417565 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); + /** * strstarts - does @str start with @prefix? * @str: string to examine diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 5a0666bfcf85..ad7e642bd497 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -316,24 +316,50 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +/* + * Required for uniquely and securely identifying mm in rss_stat tracepoint. + */ +#ifndef __PTR_TO_HASHVAL +static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) +{ + int ret; + unsigned long hashval; + + ret = ptr_to_hashval(ptr, &hashval); + if (ret) + return 0; + + /* The hashed value is only 32-bit */ + return (unsigned int)hashval; +} +#define __PTR_TO_HASHVAL +#endif + TRACE_EVENT(rss_stat, - TP_PROTO(int member, + TP_PROTO(struct mm_struct *mm, + int member, long count), - TP_ARGS(member, count), + TP_ARGS(mm, member, count), TP_STRUCT__entry( + __field(unsigned int, mm_id) + __field(unsigned int, curr) __field(int, member) __field(long, size) ), TP_fast_assign( + __entry->mm_id = mm_ptr_to_hash(mm); + __entry->curr = !!(current->mm == mm); __entry->member = member; __entry->size = (count << PAGE_SHIFT); ), - TP_printk("member=%d size=%ldB", + TP_printk("mm_id=%u curr=%d member=%d size=%ldB", + __entry->mm_id, + __entry->curr, __entry->member, __entry->size) ); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index dee8fc467fcf..7c488a1ce318 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -761,11 +761,38 @@ static int __init initialize_ptr_random(void) early_initcall(initialize_ptr_random); /* Maps a pointer to a 32 bit unique identifier. */ +static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) +{ + unsigned long hashval; + + if (static_branch_unlikely(¬_filled_random_ptr_key)) + return -EAGAIN; + +#ifdef CONFIG_64BIT + hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); + /* + * Mask off the first 32 bits, this makes explicit that we have + * modified the address (and 32 bits is plenty for a unique ID). + */ + hashval = hashval & 0xffffffff; +#else + hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); +#endif + *hashval_out = hashval; + return 0; +} + +int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) +{ + return __ptr_to_hashval(ptr, hashval_out); +} + static char *ptr_to_id(char *buf, char *end, const void *ptr, struct printf_spec spec) { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; + int ret; /* When debugging early boot use non-cryptographically secure hash. */ if (unlikely(debug_boot_weak_hash)) { @@ -773,22 +800,13 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, return pointer_string(buf, end, (const void *)hashval, spec); } - if (static_branch_unlikely(¬_filled_random_ptr_key)) { + ret = __ptr_to_hashval(ptr, &hashval); + if (ret) { spec.field_width = 2 * sizeof(ptr); /* string length must be less than default_width */ return error_string(buf, end, str, spec); } -#ifdef CONFIG_64BIT - hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); - /* - * Mask off the first 32 bits, this makes explicit that we have - * modified the address (and 32 bits is plenty for a unique ID). - */ - hashval = hashval & 0xffffffff; -#else - hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); -#endif return pointer_string(buf, end, (const void *)hashval, spec); } diff --git a/mm/memory.c b/mm/memory.c index 57c910aaba45..62b5cce653f6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -154,9 +154,9 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); -void mm_trace_rss_stat(int member, long count) +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { - trace_rss_stat(member, count); + trace_rss_stat(mm, member, count); } #if defined(SPLIT_RSS_COUNTING) -- cgit v1.2.3 From 625110b5e9dae9074d8a7e67dd07f821a053eed7 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Sat, 30 Nov 2019 17:51:32 -0800 Subject: mm/memory.c: fix a huge pud insertion race during faulting A huge pud page can theoretically be faulted in racing with pmd_alloc() in __handle_mm_fault(). That will lead to pmd_alloc() returning an invalid pmd pointer. Fix this by adding a pud_trans_unstable() function similar to pmd_trans_unstable() and check whether the pud is really stable before using the pmd pointer. Race: Thread 1: Thread 2: Comment create_huge_pud() Fallback - not taken. create_huge_pud() Taken. pmd_alloc() Returns an invalid pointer. This will result in user-visible huge page data corruption. Note that this was caught during a code audit rather than a real experienced problem. It looks to me like the only implementation that currently creates huge pud pagetable entries is dev_dax_huge_fault() which doesn't appear to care much about private (COW) mappings or write-tracking which is, I believe, a prerequisite for create_huge_pud() falling back on thread 1, but not in thread 2. Link: http://lkml.kernel.org/r/20191115115808.21181-2-thomas_os@shipmail.org Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") Signed-off-by: Thomas Hellstrom Acked-by: Kirill A. Shutemov Cc: Arnd Bergmann Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 25 +++++++++++++++++++++++++ mm/memory.c | 6 ++++++ 2 files changed, 31 insertions(+) (limited to 'mm/memory.c') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 3127f9028f54..798ea36a0549 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -938,6 +938,31 @@ static inline int pud_trans_huge(pud_t pud) } #endif +/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ +static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) +{ + pud_t pudval = READ_ONCE(*pud); + + if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) + return 1; + if (unlikely(pud_bad(pudval))) { + pud_clear_bad(pud); + return 1; + } + return 0; +} + +/* See pmd_trans_unstable for discussion. */ +static inline int pud_trans_unstable(pud_t *pud) +{ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + return pud_none_or_trans_huge_or_dev_or_clear_bad(pud); +#else + return 0; +#endif +} + #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { diff --git a/mm/memory.c b/mm/memory.c index 62b5cce653f6..c3902201989f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4010,6 +4010,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; +retry_pud: if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) @@ -4036,6 +4037,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; + + /* Huge pud page fault raced with pmd_alloc? */ + if (pud_trans_unstable(vmf.pud)) + goto retry_pud; + if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) -- cgit v1.2.3 From f4f5329d453704e2214011ecf00db73cd3196d06 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:58:17 -0800 Subject: mm: fix typos in comments when calling __SetPageUptodate() There are several places emphasise the effect of __SetPageUptodate(), while the comment seems to have a typo in two places. Link: http://lkml.kernel.org/r/20190926023705.7226-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- mm/userfaultfd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index c3902201989f..513c3ecc76ee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3105,7 +3105,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* * The memory barrier inside __SetPageUptodate makes sure that - * preceeding stores to the page contents become visible before + * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a10aa8563e41..1b0d7abad1d4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -90,7 +90,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* * The memory barrier inside __SetPageUptodate makes sure that - * preceeding stores to the page contents become visible before + * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); -- cgit v1.2.3